mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-07 09:34:07 +00:00
Compare commits
53 Commits
gg/specula
...
b4227
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0533e7fb38 | ||
|
|
7cc2d2c889 | ||
|
|
b782e5c7d4 | ||
|
|
3a8e9af402 | ||
|
|
a3a3048e7a | ||
|
|
f0678c5ff4 | ||
|
|
4b3242bbea | ||
|
|
0f77aae560 | ||
|
|
266b8519ee | ||
|
|
938f608742 | ||
|
|
f095a649ec | ||
|
|
678d7994f4 | ||
|
|
dc22344088 | ||
|
|
4c0a95b107 | ||
|
|
6c59567689 | ||
|
|
890719311b | ||
|
|
7281cf13ad | ||
|
|
e90688edd0 | ||
|
|
76b27d29c2 | ||
|
|
eea986f215 | ||
|
|
c202cef168 | ||
|
|
2025fa67e9 | ||
|
|
c6bc73951e | ||
|
|
605fa66c50 | ||
|
|
b7420131bf | ||
|
|
9f912511bc | ||
|
|
3ad5451f3b | ||
|
|
46c69e0e75 | ||
|
|
9e2301f4a4 | ||
|
|
fee824a1a1 | ||
|
|
9150f8fef9 | ||
|
|
c31ed2abfc | ||
|
|
5b3466bedf | ||
|
|
249a7902ec | ||
|
|
71a64989a5 | ||
|
|
4a57d362e1 | ||
|
|
c9b00a70b0 | ||
|
|
de5097351c | ||
|
|
5a349f2809 | ||
|
|
30ec398321 | ||
|
|
be0e350c8b | ||
|
|
249cd93da3 | ||
|
|
904109ed0d | ||
|
|
45abe0f74e | ||
|
|
0bbd2262a3 | ||
|
|
ab96610b1e | ||
|
|
7db3846a94 | ||
|
|
c6807b3f28 | ||
|
|
25669aa92c | ||
|
|
84e1c33cde | ||
|
|
811872a59d | ||
|
|
9a4b79bcfa | ||
|
|
7066b4cce2 |
@@ -17,8 +17,10 @@ Checks: >
|
||||
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
|
||||
performance-*,
|
||||
portability-*,
|
||||
-portability-simd-intrinsics,
|
||||
misc-*,
|
||||
-misc-const-correctness,
|
||||
-misc-non-private-member-variables-in-classes,
|
||||
-misc-no-recursion,
|
||||
-misc-use-anonymous-namespace,
|
||||
FormatStyle: none
|
||||
|
||||
@@ -6,6 +6,9 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
|
||||
# MUSA architecture to build for (defaults to all supported archs)
|
||||
ARG MUSA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
|
||||
@@ -19,7 +22,11 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
# Use the default MUSA archs if not specified
|
||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc) && \
|
||||
cp build/bin/* .
|
||||
|
||||
|
||||
@@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
|
||||
# MUSA architecture to build for (defaults to all supported archs)
|
||||
ARG MUSA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git cmake
|
||||
|
||||
@@ -15,7 +18,11 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
# Use the default MUSA archs if not specified
|
||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release --target llama-cli -j$(nproc) && \
|
||||
mkdir -p /app/lib && \
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
@@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
|
||||
# MUSA architecture to build for (defaults to all supported archs)
|
||||
ARG MUSA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||
|
||||
@@ -15,7 +18,11 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
# Use the default MUSA archs if not specified
|
||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release --target llama-server -j$(nproc) && \
|
||||
mkdir -p /app/lib && \
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
@@ -34,7 +34,7 @@ let
|
||||
|
||||
# server tests
|
||||
openai
|
||||
behave
|
||||
pytest
|
||||
prometheus-client
|
||||
];
|
||||
in
|
||||
|
||||
15
.github/labeler.yml
vendored
15
.github/labeler.yml
vendored
@@ -3,19 +3,18 @@ Kompute:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-kompute.h
|
||||
- ggml/src/ggml-kompute.cpp
|
||||
- ggml/src/ggml-kompute/**
|
||||
- README-kompute.md
|
||||
Apple Metal:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-metal.h
|
||||
- ggml/src/ggml-metal.cpp
|
||||
- ggml/src/ggml-metal/**
|
||||
- README-metal.md
|
||||
SYCL:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-sycl.h
|
||||
- ggml/src/ggml-sycl.cpp
|
||||
- ggml/src/ggml-sycl/**
|
||||
- docs/backend/SYCL.md
|
||||
- examples/sycl/**
|
||||
@@ -27,8 +26,8 @@ Nvidia GPU:
|
||||
Vulkan:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/ggml_vk_generate_shaders.py
|
||||
- ggml/src/ggml-vulkan*
|
||||
- ggml/include/ggml-vulkan.h
|
||||
- ggml/src/ggml-vulkan/**
|
||||
documentation:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -75,11 +74,7 @@ server:
|
||||
ggml:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml*.h
|
||||
- ggml/src/ggml*.c
|
||||
- ggml/src/ggml*.cpp
|
||||
- ggml/src/ggml*.h
|
||||
- ggml-cuda/**
|
||||
- ggml/**
|
||||
nix:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
||||
129
.github/workflows/build.yml
vendored
129
.github/workflows/build.yml
vendored
@@ -728,7 +728,7 @@ jobs:
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-2019
|
||||
runs-on: windows-latest
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
@@ -871,12 +871,33 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-latest-cmake-cuda:
|
||||
ubuntu-latest-cmake-cuda:
|
||||
runs-on: ubuntu-latest
|
||||
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
env:
|
||||
DEBIAN_FRONTEND: noninteractive
|
||||
run: |
|
||||
apt update
|
||||
apt install -y cmake build-essential ninja-build libgomp1 git
|
||||
|
||||
- name: Build with CMake
|
||||
run: |
|
||||
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
|
||||
cmake --build build
|
||||
|
||||
windows-2019-cmake-cuda:
|
||||
runs-on: windows-2019
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.2.0', '11.7.1']
|
||||
cuda: ['12.4', '11.7']
|
||||
build: ['cuda']
|
||||
|
||||
steps:
|
||||
@@ -884,24 +905,83 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install CUDA toolkit
|
||||
id: cuda-toolkit
|
||||
uses: Jimver/cuda-toolkit@v0.2.15
|
||||
- name: Install Cuda Toolkit 11.7
|
||||
if: ${{ matrix.cuda == '11.7' }}
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 12.4
|
||||
if: ${{ matrix.cuda == '12.4' }}
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
method: 'network'
|
||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
||||
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
||||
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
@@ -930,10 +1010,12 @@ jobs:
|
||||
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
echo "Cuda install location: ${{ env.CUDA_PATH }}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
@@ -984,7 +1066,7 @@ jobs:
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
@@ -1009,7 +1091,7 @@ jobs:
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
@@ -1039,6 +1121,11 @@ jobs:
|
||||
run: |
|
||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2
|
||||
with:
|
||||
key: ${{ github.job }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
@@ -1059,6 +1146,8 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
@@ -1173,7 +1262,7 @@ jobs:
|
||||
- macOS-latest-make
|
||||
- macOS-latest-cmake
|
||||
- windows-latest-cmake
|
||||
- windows-latest-cmake-cuda
|
||||
- windows-2019-cmake-cuda
|
||||
- windows-latest-cmake-hip-release
|
||||
- macOS-latest-cmake-arm64
|
||||
- macOS-latest-cmake-x64
|
||||
|
||||
2
.github/workflows/docker.yml
vendored
2
.github/workflows/docker.yml
vendored
@@ -114,7 +114,7 @@ jobs:
|
||||
swap-storage: true
|
||||
|
||||
- name: Build and push Docker image (tagged + versioned)
|
||||
if: github.event_name == 'push'
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
|
||||
72
.github/workflows/nix-ci-aarch64.yml
vendored
72
.github/workflows/nix-ci-aarch64.yml
vendored
@@ -1,72 +0,0 @@
|
||||
name: Nix aarch64 builds
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
schedule:
|
||||
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
|
||||
# 1.5h instead of minutes with the cold cache).
|
||||
#
|
||||
# randint(0, 59), randint(0, 23)
|
||||
- cron: '26 12 * * *'
|
||||
# But also rebuild if we touched any of the Nix expressions:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['**/*.nix', 'flake.lock']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/*.nix', 'flake.lock']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
nix-build-aarch64:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install QEMU
|
||||
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y qemu-user-static qemu-system-aarch64
|
||||
sudo usermod -a -G kvm $USER
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-platforms = aarch64-linux
|
||||
extra-system-features = nixos-test kvm
|
||||
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: llama-cpp
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.aarch64-linux"
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--systems aarch64-linux
|
||||
--flake
|
||||
".#checks.aarch64-linux"
|
||||
79
.github/workflows/nix-ci.yml
vendored
79
.github/workflows/nix-ci.yml
vendored
@@ -1,79 +0,0 @@
|
||||
name: Nix CI
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
nix-eval:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest ]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: List all flake outputs
|
||||
run: nix flake show --all-systems
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
nix-build:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest ]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: llama-cpp
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--flake
|
||||
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
22
.github/workflows/nix-flake-update.yml
vendored
22
.github/workflows/nix-flake-update.yml
vendored
@@ -1,22 +0,0 @@
|
||||
name: update-flake-lock
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
|
||||
|
||||
jobs:
|
||||
lockfile:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@main
|
||||
- name: Update flake.lock
|
||||
uses: DeterminateSystems/update-flake-lock@main
|
||||
with:
|
||||
pr-title: "nix: update flake.lock"
|
||||
pr-labels: |
|
||||
nix
|
||||
pr-reviewers: philiptaron,SomeoneSerge
|
||||
token: ${{ secrets.FLAKE_TOKEN }}
|
||||
36
.github/workflows/nix-publish-flake.yml
vendored
36
.github/workflows/nix-publish-flake.yml
vendored
@@ -1,36 +0,0 @@
|
||||
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
|
||||
name: "Publish a flake to flakestry & flakehub"
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "*"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "The existing tag to publish"
|
||||
type: "string"
|
||||
required: true
|
||||
jobs:
|
||||
flakestry-publish:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: flakestry/flakestry-publish@main
|
||||
with:
|
||||
version: "${{ inputs.tag || github.ref_name }}"
|
||||
flakehub-publish:
|
||||
runs-on: "ubuntu-latest"
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: "actions/checkout@v4"
|
||||
with:
|
||||
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
|
||||
- uses: "DeterminateSystems/nix-installer-action@main"
|
||||
- uses: "DeterminateSystems/flakehub-push@main"
|
||||
with:
|
||||
visibility: "public"
|
||||
tag: "${{ inputs.tag }}"
|
||||
9
.github/workflows/python-lint.yml
vendored
9
.github/workflows/python-lint.yml
vendored
@@ -1,6 +1,13 @@
|
||||
name: flake8 Lint
|
||||
|
||||
on: [push, pull_request]
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
|
||||
9
.github/workflows/server.yml
vendored
9
.github/workflows/server.yml
vendored
@@ -122,14 +122,14 @@ jobs:
|
||||
id: server_integration_tests
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
PORT=8888 ./tests.sh
|
||||
./tests.sh
|
||||
|
||||
- name: Slow tests
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
|
||||
SLOW_TESTS=1 ./tests.sh
|
||||
|
||||
|
||||
server-windows:
|
||||
@@ -180,11 +180,12 @@ jobs:
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
$env:PYTHONIOENCODING = ":replace"
|
||||
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
|
||||
pytest -v -x
|
||||
|
||||
- name: Slow tests
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
behave.exe --stop --no-skipped --no-capture --tags slow
|
||||
$env:SLOW_TESTS = "1"
|
||||
pytest -v -x
|
||||
|
||||
186
AUTHORS
186
AUTHORS
@@ -1,4 +1,4 @@
|
||||
# date: Wed Jun 26 19:36:34 EEST 2024
|
||||
# date: Thu Nov 28 20:46:15 EET 2024
|
||||
# this file is auto-generated by scripts/gen-authors.sh
|
||||
|
||||
0cc4m <picard12@live.de>
|
||||
@@ -7,6 +7,7 @@
|
||||
2f38b454 <dxf@protonmail.com>
|
||||
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
|
||||
44670 <44670@users.noreply.github.com>
|
||||
65a <10104049+65a@users.noreply.github.com>
|
||||
AN Long <aisk@users.noreply.github.com>
|
||||
AT <manyoso@users.noreply.github.com>
|
||||
Aarni Koskela <akx@iki.fi>
|
||||
@@ -19,20 +20,28 @@ Adithya Balaji <adithya.b94@gmail.com>
|
||||
AdithyanI <adithyan.i4internet@gmail.com>
|
||||
Adrian <smith.adriane@gmail.com>
|
||||
Adrian Hesketh <a-h@users.noreply.github.com>
|
||||
Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
|
||||
Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
|
||||
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
|
||||
AidanBeltonS <aidan.belton@codeplay.com>
|
||||
Aisuko <urakiny@gmail.com>
|
||||
Akarshan Biswas <akarshan.biswas@gmail.com>
|
||||
Akarshan Biswas <akarshanbiswas@fedoraproject.org>
|
||||
Al Mochkin <14274697+amochkin@users.noreply.github.com>
|
||||
Albert Jin <albert.jin@gmail.com>
|
||||
Alberto <57916483+albbus-stack@users.noreply.github.com>
|
||||
Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
|
||||
Alberto Cabrera Pérez <alberto.cabrera@intel.com>
|
||||
Alex <awhill19@icloud.com>
|
||||
Alex Azarov <alex@azarov.by>
|
||||
Alex Azarov <alexander.azarov@mapbox.com>
|
||||
Alex Klinkhamer <from.github.com.917@grencez.dev>
|
||||
Alex Klinkhamer <git@grencez.dev>
|
||||
Alex Nguyen <tiendung@users.noreply.github.com>
|
||||
Alex O'Connell <35843486+acon96@users.noreply.github.com>
|
||||
Alex Petenchea <alex.petenchea@gmail.com>
|
||||
Alex Renda <alexrenda@users.noreply.github.com>
|
||||
Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com>
|
||||
Alex von Gluck IV <kallisti5@unixzen.com>
|
||||
Alexey Parfenov <zxed@alkatrazstudio.net>
|
||||
Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
|
||||
@@ -45,18 +54,25 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
|
||||
Ananta Bastola <anantarajbastola@gmail.com>
|
||||
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
|
||||
András Salamon <ott2@users.noreply.github.com>
|
||||
Andreas (Andi) Kunar <andreask@msn.com>
|
||||
Andrei <abetlen@gmail.com>
|
||||
Andrew Canis <andrew.canis@gmail.com>
|
||||
Andrew Downing <andrew2085@gmail.com>
|
||||
Andrew Duffy <a10y@users.noreply.github.com>
|
||||
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
|
||||
Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
|
||||
Andy Salerno <andysalerno@gmail.com>
|
||||
Andy Tai <andy-tai@users.noreply.github.com>
|
||||
Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
|
||||
Antonis Makropoulos <benuix@gmail.com>
|
||||
Arik Poznanski <arikpoz@users.noreply.github.com>
|
||||
Armen Kaleshian <kriation@users.noreply.github.com>
|
||||
Artem <guinmoon@gmail.com>
|
||||
Artem Zinnatullin <ceo@abstractny.gay>
|
||||
Artyom Lebedev <vagran.ast@gmail.com>
|
||||
Asbjørn Olling <asbjornolling@gmail.com>
|
||||
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
|
||||
Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
|
||||
Ashish <1856117+ashishdatta@users.noreply.github.com>
|
||||
Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
|
||||
Ashraful Islam <ashraful.meche@gmail.com>
|
||||
@@ -76,12 +92,16 @@ Ben Williams <ben@719ben.com>
|
||||
Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
|
||||
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
|
||||
Bernat Vadell <hounter.caza@gmail.com>
|
||||
Bert Wagner <github@bertwagner.com>
|
||||
Bingan <70050083+binganao@users.noreply.github.com>
|
||||
Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
|
||||
Bodo Graumann <mail@bodograumann.de>
|
||||
Bono Lv <lvscar@users.noreply.github.com>
|
||||
Borislav Stanimirov <b.stanimirov@abv.bg>
|
||||
Branden Butler <bwtbutler@hotmail.com>
|
||||
Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
|
||||
Brian <mofosyne@gmail.com>
|
||||
Brian Cunnie <brian.cunnie@gmail.com>
|
||||
Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||
Bryan Honof <bryanhonof@gmail.com>
|
||||
CJ Pais <cj@cjpais.com>
|
||||
@@ -90,32 +110,47 @@ Calvin Laurenson <calvin@laurenson.dev>
|
||||
Cameron <csteele@steelecameron.com>
|
||||
Cameron Kaiser <classilla@users.noreply.github.com>
|
||||
Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
|
||||
CarryFun <76023481+CarryFun@users.noreply.github.com>
|
||||
Carsten Kragelund Jørgensen <carsten@kragelund.me>
|
||||
CarterLi999 <664681047@qq.com>
|
||||
Casey Primozic <casey@cprimozic.net>
|
||||
Casey Primozic <me@ameo.link>
|
||||
CausalLM <148736309+CausalLM@users.noreply.github.com>
|
||||
Cebtenzzre <cebtenzzre@gmail.com>
|
||||
Chad Brewbaker <crb002@gmail.com>
|
||||
Changyeon Kim <cyzero.kim@samsung.com>
|
||||
Chao Jiang <jc19chaoj@zoho.com>
|
||||
Charles Xu <63788048+chaxu01@users.noreply.github.com>
|
||||
Charles Xu <charles.xu@arm.com>
|
||||
Chen Xi <xi2.chen@intel.com>
|
||||
Chen Xi <xixichen08@foxmail.com>
|
||||
Cheng Shao <terrorjack@type.dance>
|
||||
Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
|
||||
Chris Elrod <elrodc@gmail.com>
|
||||
Chris Kuehl <ckuehl@ckuehl.me>
|
||||
Christian Demsar <christian@github.email.demsar.us>
|
||||
Christian Demsar <crasm@git.vczf.us>
|
||||
Christian Falch <875252+chrfalch@users.noreply.github.com>
|
||||
Christian Kögler <ck3d@gmx.de>
|
||||
Christian Köhnenkamp <cvk5@me.com>
|
||||
Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
|
||||
Clark Saben <76020733+csaben@users.noreply.github.com>
|
||||
Clint Herron <hanclinto@gmail.com>
|
||||
Conrad Kramer <conrad@conradkramer.com>
|
||||
CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
|
||||
Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
|
||||
Cuong Trinh Manh <nguoithichkhampha@gmail.com>
|
||||
DAN™ <dranger003@gmail.com>
|
||||
Damian Stewart <d@damianstewart.com>
|
||||
Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
|
||||
Dan Johansson <dan.johansson@arm.com>
|
||||
Dane Madsen <dane_madsen@hotmail.com>
|
||||
DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
|
||||
Daniel Bevenius <daniel.bevenius@gmail.com>
|
||||
Daniel Drake <drake@endlessos.org>
|
||||
Daniel Hiltgen <dhiltgen@users.noreply.github.com>
|
||||
Daniel Illescas Romero <illescas.daniel@protonmail.com>
|
||||
Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
|
||||
Daniele <57776841+daniandtheweb@users.noreply.github.com>
|
||||
DannyDaemonic <DannyDaemonic@gmail.com>
|
||||
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
|
||||
@@ -129,19 +164,28 @@ David Pflug <david@pflug.email>
|
||||
David Renshaw <dwrenshaw@gmail.com>
|
||||
David Sommers <12738+databyte@users.noreply.github.com>
|
||||
David Yang <davidyang6us@gmail.com>
|
||||
DavidKorczynski <david@adalogics.com>
|
||||
Dawid Potocki <github@dawidpotocki.com>
|
||||
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
|
||||
Dean <Dean.Sinaean@gmail.com>
|
||||
Deins <deinsegle@gmail.com>
|
||||
Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com>
|
||||
Derrick T. Woolworth <dwoolworth@gmail.com>
|
||||
Deven Mistry <31466137+deven367@users.noreply.github.com>
|
||||
Dibakar Gope <dibakar.gope@arm.com>
|
||||
Didzis Gosko <didzis@users.noreply.github.com>
|
||||
Diego Devesa <slarengh@gmail.com>
|
||||
Diogo Teles Sant'Anna <diogoteles@google.com>
|
||||
Djip007 <djip.perois@free.fr>
|
||||
Don Mahurin <dmahurin@users.noreply.github.com>
|
||||
DooWoong Lee (David) <manics99@naver.com>
|
||||
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
|
||||
Dou Xinpeng <15529241576@163.com>
|
||||
Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
|
||||
Douglas Hanley <thesecretaryofwar@gmail.com>
|
||||
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
|
||||
Ebey Abraham <ebey97@gmail.com>
|
||||
Echo Nolan <echo@echonolan.net>
|
||||
Ed Lee <edilee@mozilla.com>
|
||||
Ed Lepedus <ed.lepedus@googlemail.com>
|
||||
Eddie-Wang <wangjinheng1120@163.com>
|
||||
@@ -151,10 +195,13 @@ Elbios <141279586+Elbios@users.noreply.github.com>
|
||||
Elton Kola <eltonkola@gmail.com>
|
||||
Engininja2 <139037756+Engininja2@users.noreply.github.com>
|
||||
Equim <sayaka@ekyu.moe>
|
||||
Eric Curtin <ecurtin@redhat.com>
|
||||
Eric Curtin <ericcurtin17@gmail.com>
|
||||
Eric Sommerlade <es0m@users.noreply.github.com>
|
||||
Eric Zhang <34133756+EZForever@users.noreply.github.com>
|
||||
Erik Garrison <erik.garrison@gmail.com>
|
||||
Erik Scholz <Green-Sky@users.noreply.github.com>
|
||||
Esko Toivonen <eskot98@gmail.com>
|
||||
Ettore Di Giacinto <mudler@users.noreply.github.com>
|
||||
Evan Jones <evan.q.jones@gmail.com>
|
||||
Evan Miller <emmiller@gmail.com>
|
||||
@@ -166,19 +213,26 @@ FK <sozforex@gmail.com>
|
||||
Fabian <cmdrf@users.noreply.github.com>
|
||||
Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
|
||||
Faez Shakil <faez.shakil@gmail.com>
|
||||
Faisal Zaghloul <faisal.zaghloul@gmail.com>
|
||||
Faisal Zaghloul <quic_fzaghlou@quicinc.com>
|
||||
Fan Shupei <dymarkfan@outlook.com>
|
||||
FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
|
||||
Farbod Bijary <110523279+farbodbj@users.noreply.github.com>
|
||||
Fattire <528174+fat-tire@users.noreply.github.com>
|
||||
Felix <stenbackfelix@gmail.com>
|
||||
Finn Voorhees <finnvoorhees@gmail.com>
|
||||
Firat <firatkiral@gmail.com>
|
||||
FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
|
||||
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
|
||||
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
|
||||
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
|
||||
Frank Mai <thxcode0824@gmail.com>
|
||||
FrankHB <frankhb1989@gmail.com>
|
||||
Frankie Robertson <frankier@users.noreply.github.com>
|
||||
Fred Douglas <43351173+fredlas@users.noreply.github.com>
|
||||
Frederik Vogel <Schaltfehler@users.noreply.github.com>
|
||||
Gabe Goodhart <gabe.l.hart@gmail.com>
|
||||
Gabe Goodhart <ghart@us.ibm.com>
|
||||
GainLee <perfecter.gen@gmail.com>
|
||||
Galunid <karolek1231456@gmail.com>
|
||||
Gary Linscott <glinscott@gmail.com>
|
||||
@@ -187,11 +241,13 @@ Gavin Zhao <gavinzhaojw@protonmail.com>
|
||||
Genkagaku.GPT <hlhr202@163.com>
|
||||
Georgi Gerganov <ggerganov@gmail.com>
|
||||
Gilad S <giladgd@users.noreply.github.com>
|
||||
Gilad S. <7817232+giladgd@users.noreply.github.com>
|
||||
Giuseppe Scrivano <giuseppe@scrivano.org>
|
||||
GiviMAD <GiviMAD@users.noreply.github.com>
|
||||
Govlzkoy <gotope@users.noreply.github.com>
|
||||
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
|
||||
Guillaume Wenzek <gwenzek@users.noreply.github.com>
|
||||
Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
|
||||
Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
|
||||
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
|
||||
Haggai Nuchi <h.nuchi@gmail.com>
|
||||
@@ -213,11 +269,14 @@ Hong Bo PENG <penghb@cn.ibm.com>
|
||||
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
|
||||
Howard Su <howard0su@gmail.com>
|
||||
Hua Jiang <allenhjiang@outlook.com>
|
||||
Huang Qi <huangqi3@xiaomi.com>
|
||||
Huawei Lin <huaweilin.cs@gmail.com>
|
||||
Hugo Roussel <hugo.rous@gmail.com>
|
||||
Huifeng Ou <79071290+ho2103@users.noreply.github.com>
|
||||
Ian Bull <irbull@eclipsesource.com>
|
||||
Ian Bull <irbull@gmail.com>
|
||||
Ian Scrivener <github@zilogy.asia>
|
||||
Icecream95 <the.real.icecream95@gmail.com>
|
||||
Ido S <ido.pluto@gmail.com>
|
||||
IgnacioFDM <ignaciofdm@gmail.com>
|
||||
Igor Okulist <okigan@gmail.com>
|
||||
@@ -226,11 +285,15 @@ Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
|
||||
Ionoclast Laboratories <brigham@ionoclast.com>
|
||||
Isaac McFadyen <isaac@imcf.me>
|
||||
IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
|
||||
Ivan <nekotekina@gmail.com>
|
||||
Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
|
||||
Ivan Komarov <Ivan.Komarov@dfyz.info>
|
||||
Ivan Stepanov <ivanstepanovftw@gmail.com>
|
||||
JH23X <165871467+JH23X@users.noreply.github.com>
|
||||
Jack Mousseau <jack@software.inc>
|
||||
Jack Mousseau <jmousseau@users.noreply.github.com>
|
||||
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
|
||||
Jaeden Amero <jaeden@patater.com>
|
||||
Jaemin Son <woalsdnd@gmail.com>
|
||||
Jag Chadha <jagtesh@gmail.com>
|
||||
Jakub N <jakubniemczyk97@gmail.com>
|
||||
@@ -243,10 +306,14 @@ Jannis Schönleber <joennlae@gmail.com>
|
||||
Jared Van Bortel <cebtenzzre@gmail.com>
|
||||
Jared Van Bortel <jared@nomic.ai>
|
||||
Jason McCartney <jmac@theroot.org>
|
||||
Jason Stillerman <jason.t.stillerman@gmail.com>
|
||||
Jean-Christophe Hoelt <hoelt@fovea.cc>
|
||||
Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
|
||||
Jed Fox <git@jedfox.com>
|
||||
Jeff Bolz <jbolz@nvidia.com>
|
||||
Jeffrey Morgan <jmorganca@gmail.com>
|
||||
Jeffrey Quesnelle <emozilla@nousresearch.com>
|
||||
Jeroen Mostert <jeroen.mostert@cm.com>
|
||||
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
|
||||
Jeximo <jeximo@gmail.com>
|
||||
Jhen-Jie Hong <iainst0409@gmail.com>
|
||||
@@ -258,6 +325,9 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
|
||||
Jiří Sejkora <Sejseloid@gmail.com>
|
||||
Joan Fontanals <jfontanalsmartinez@gmail.com>
|
||||
Joan Fontanals <joan.fontanals.martinez@jina.ai>
|
||||
João Dinis Ferreira <hello@joaof.eu>
|
||||
Joe Eli McIlvain <joe.eli.mac@gmail.com>
|
||||
Joe Todd <joe.todd@codeplay.com>
|
||||
Johan <JohanAR@users.noreply.github.com>
|
||||
Johannes Gäßler <johannesg@5d6.de>
|
||||
Johannes Rudolph <johannes.rudolph@gmail.com>
|
||||
@@ -274,7 +344,9 @@ Joyce <joycebrum@google.com>
|
||||
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
|
||||
Judd <foldl@users.noreply.github.com>
|
||||
Julius Arkenberg <arki05@users.noreply.github.com>
|
||||
Jun Hee Yoo <contact.jhyoo@gmail.com>
|
||||
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
|
||||
Junil Kim <logyourself@gmail.com>
|
||||
Junyang Lin <justinlin930319@hotmail.com>
|
||||
Juraj Bednar <juraj@bednar.io>
|
||||
Justin Parker <jparkerweb@gmail.com>
|
||||
@@ -292,12 +364,14 @@ Karthik Sethuraman <k.seth1993@gmail.com>
|
||||
Kasumi <90275229+kasumi-1@users.noreply.github.com>
|
||||
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
|
||||
Keiichi Tabata <keiichi.tabata@outlook.com>
|
||||
Keke Han <hankeke303@163.com>
|
||||
Kenvix ⭐ <kenvixzure@live.com>
|
||||
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
|
||||
Kevin Gibbons <bakkot@gmail.com>
|
||||
Kevin Ji <1146876+kevinji@users.noreply.github.com>
|
||||
Kevin Kwok <antimatter15@gmail.com>
|
||||
Kevin Lo <kevlo@kevlo.org>
|
||||
Kevin Wang <kevmo314@gmail.com>
|
||||
Kolen Cheung <ickc@users.noreply.github.com>
|
||||
Konstantin Herud <konstantin.herud@denkbares.com>
|
||||
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
|
||||
@@ -315,22 +389,29 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
|
||||
Leonardo Neumann <leonardo@neumann.dev.br>
|
||||
Li Tan <tanliboy@gmail.com>
|
||||
Linwei Wang <wanix1988@gmail.com>
|
||||
Liu Jia <109258120+Septa2112@users.noreply.github.com>
|
||||
Liu Jia <jia3.liu@intel.com>
|
||||
LoganDark <github@logandark.mozmail.com>
|
||||
Loïc Carrère <loic.carrere@gmail.com>
|
||||
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
||||
Luciano <lucianostrika44@gmail.com>
|
||||
Luo Tian <lt@basecity.com>
|
||||
Lyle Dean <dean@lyle.dev>
|
||||
M-A <maruel@gmail.com>
|
||||
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
||||
Ma Mingfei <mingfei.ma@intel.com>
|
||||
Maarten ter Huurne <maarten@treewalker.org>
|
||||
Mack Straight <eiz@users.noreply.github.com>
|
||||
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
||||
MaggotHATE <clay1326@gmail.com>
|
||||
Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
|
||||
Manuel <44313466+makuche@users.noreply.github.com>
|
||||
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
||||
Marco Matthies <71844+marcom@users.noreply.github.com>
|
||||
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
||||
Marian Cepok <marian.cepok@gmail.com>
|
||||
Mark Fairbairn <thebaron88@gmail.com>
|
||||
Mark Zhuang <zhuangqiubin@gmail.com>
|
||||
Marko Tasic <mtasic85@gmail.com>
|
||||
Markus Tavenrath <mtavenrath@users.noreply.github.com>
|
||||
Martin Delille <martin@delille.org>
|
||||
@@ -342,11 +423,15 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
|
||||
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
||||
Matheus C. França <matheus-catarino@hotmail.com>
|
||||
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
||||
Mathieu Geli <mathieu.geli@gmail.com>
|
||||
Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
|
||||
Mathijs Henquet <mathijs.henquet@gmail.com>
|
||||
Mathijs de Bruin <mathijs@mathijsfietst.nl>
|
||||
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
||||
Matt Pulver <matt.pulver@heavy.ai>
|
||||
Matt Stephenson <mstephenson6@users.noreply.github.com>
|
||||
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
||||
Matteo Mortari <matteo.mortari@gmail.com>
|
||||
Mattheus Chediak <shammcity00@gmail.com>
|
||||
Matthew Tejo <matthew.tejo@gmail.com>
|
||||
Matvey Soloviev <blackhole89@gmail.com>
|
||||
@@ -356,8 +441,10 @@ Maxime <672982+maximegmd@users.noreply.github.com>
|
||||
Maximilian Winter <maximilian.winter.91@gmail.com>
|
||||
Meng Zhang <meng@tabbyml.com>
|
||||
Meng, Hengyu <hengyu.meng@intel.com>
|
||||
Mengqing Cao <cmq0113@163.com>
|
||||
Merrick Christensen <merrick.christensen@gmail.com>
|
||||
Michael Coppola <m18coppola@gmail.com>
|
||||
Michael Francis <edude03@gmail.com>
|
||||
Michael Hueschen <m@mhueschen.dev>
|
||||
Michael Kesper <mkesper@schokokeks.org>
|
||||
Michael Klimenko <mklimenko29@gmail.com>
|
||||
@@ -365,41 +452,57 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
|
||||
Michael Potter <NanoTekGuy@Gmail.com>
|
||||
Michael de Gans <michael.john.degans@gmail.com>
|
||||
Michaël de Vries <vriesdemichael@gmail.com>
|
||||
Michał Tuszyński <srgtuszy@gmail.com>
|
||||
Mihai <mihai.chirculescu@yahoo.com>
|
||||
Mike <ytianhui2004@gmail.com>
|
||||
Mikko Juola <mikjuo@gmail.com>
|
||||
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
||||
Minsoo Cheong <icycle0409@snu.ac.kr>
|
||||
Mirko185 <mirkosig@gmail.com>
|
||||
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
||||
MistApproach <98988043+MistApproach@users.noreply.github.com>
|
||||
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
||||
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
||||
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
|
||||
Molly Sophia <mollysophia379@gmail.com>
|
||||
MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
|
||||
Murilo Santana <mvrilo@gmail.com>
|
||||
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
||||
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
||||
Nathan Epstein <nate2@umbc.edu>
|
||||
Natsu <chino@hotococoa.moe>
|
||||
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
||||
Nebula <infinitewormhole@gmail.com>
|
||||
Neo Zhang <14088817+arthw@users.noreply.github.com>
|
||||
Neo Zhang <zhang.jianyu@outlook.com>
|
||||
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
||||
Neuman Vong <neuman.vong@gmail.com>
|
||||
Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
|
||||
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
||||
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
||||
Nicholai Tukanov <nicholaitukanov@gmail.com>
|
||||
Nico Bosshard <nico@bosshome.ch>
|
||||
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
||||
Nicolás Pérez <nicolas_perez@brown.edu>
|
||||
Nigel Bosch <pnigelb@gmail.com>
|
||||
Niklas Korz <niklas@niklaskorz.de>
|
||||
NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
|
||||
Nikolas <127742645+nneubacher@users.noreply.github.com>
|
||||
Nindaleth <Nindaleth@users.noreply.github.com>
|
||||
OSecret <135510162+OLSecret@users.noreply.github.com>
|
||||
Oleksandr Nikitin <oleksandr@tvori.info>
|
||||
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
||||
Olivier Chafik <ochafik@users.noreply.github.com>
|
||||
Ondřej Čertík <ondrej@certik.us>
|
||||
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
||||
PAB <pierreantoine.bannier@gmail.com>
|
||||
Pablo Duboue <pablo.duboue@gmail.com>
|
||||
Pascal Patry <ppatry@mtacitlabs.com>
|
||||
Patrice Ferlet <metal3d@gmail.com>
|
||||
Paul Tsochantaris <ptsochantaris@icloud.com>
|
||||
Pavel Zloi <github.com@drteam.rocks>
|
||||
Pavol Rusnak <pavol@rusnak.io>
|
||||
Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
|
||||
Pedro Cuenca <pedro@huggingface.co>
|
||||
Peter Sugihara <peter@campsh.com>
|
||||
Phil H <5756783+phiharri@users.noreply.github.com>
|
||||
@@ -407,10 +510,15 @@ Philip Taron <philip.taron@gmail.com>
|
||||
Phillip Kravtsov <phillip@kravtsov.net>
|
||||
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
|
||||
Pierrick Hymbert <pierrick.hymbert@gmail.com>
|
||||
Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
|
||||
Plamen Minev <pacominev@gmail.com>
|
||||
Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
|
||||
Przemysław Pawełczyk <przemoc@gmail.com>
|
||||
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
|
||||
Qingyou Meng <meng.qingyou@gmail.com>
|
||||
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
|
||||
R0CKSTAR <xiaodong.ye@mthreads.com>
|
||||
R0CKSTAR <yeahdongcn@gmail.com>
|
||||
RJ Adriaansen <adriaansen@eshcc.eur.nl>
|
||||
Radoslav Gerganov <rgerganov@gmail.com>
|
||||
Radosław Gryta <radek.gryta@gmail.com>
|
||||
@@ -419,11 +527,13 @@ Raj Hammeer Singh Hada <hammeerraj@gmail.com>
|
||||
Ralph Soika <ralph.soika@imixs.com>
|
||||
Rand Xie <randxiexyy29@gmail.com>
|
||||
Randall Fitzgerald <randall@dasaku.net>
|
||||
Random Fly <renfei8@live.cn>
|
||||
Reinforce-II <fate@eastal.com>
|
||||
Ren Xuancheng <jklj077@users.noreply.github.com>
|
||||
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
|
||||
RhinoDevel <RhinoDevel@users.noreply.github.com>
|
||||
Riceball LEE <snowyu.lee@gmail.com>
|
||||
Rich Dougherty <rich@rd.nz>
|
||||
Richard Kiss <him@richardkiss.com>
|
||||
Richard Roberson <richardr1126@gmail.com>
|
||||
Rick G <26732651+TheFlipbook@users.noreply.github.com>
|
||||
@@ -439,21 +549,30 @@ Robey Holderith <robey@flaminglunchbox.net>
|
||||
Robyn <robyngraf@users.noreply.github.com>
|
||||
Roger Meier <r.meier@siemens.com>
|
||||
Roland <14355895+rbur0425@users.noreply.github.com>
|
||||
Romain Biessy <romain.biessy@codeplay.com>
|
||||
Romain D <90720+Artefact2@users.noreply.github.com>
|
||||
Romain Neutron <romain@neutron.io>
|
||||
Roman Parykin <donderom@gmail.com>
|
||||
Ron Evans <ron@hybridgroup.com>
|
||||
Ron Jailall <rojailal@gmail.com>
|
||||
Roni <sulpher@gmx.net>
|
||||
Ronny Brendel <ronnybrendel@gmail.com>
|
||||
Ronsor <ronsor@ronsor.pw>
|
||||
Rowan Hart <rowanbhart@gmail.com>
|
||||
Ruchira Hasaranga <ruchira66@gmail.com>
|
||||
Ruixin Huang <18860020911@163.com>
|
||||
Rune <43761327+Rune-AI@users.noreply.github.com>
|
||||
RunningLeon <maningsheng@sensetime.com>
|
||||
RunningLeon <mnsheng@yeah.net>
|
||||
Ryan Landay <rlanday@gmail.com>
|
||||
Ryder Wishart <ryderwishart@gmail.com>
|
||||
Ryuei <louixs@users.noreply.github.com>
|
||||
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
||||
SRHMorris <69468379+SRHMorris@users.noreply.github.com>
|
||||
SXX <sxx1136965276@gmail.com>
|
||||
SakuraUmi <yukinon244@gmail.com>
|
||||
Salvador E. Tropea <stropea@inti.gob.ar>
|
||||
Salvatore Mesoraca <s.mesoraca16@gmail.com>
|
||||
Sam Spilsbury <smspillaz@gmail.com>
|
||||
Sami Farin <3876865+Safari77@users.noreply.github.com>
|
||||
Samuel Maynard <samwmaynard@gmail.com>
|
||||
@@ -463,23 +582,29 @@ Sebastián A <sebastian.aedo29@gmail.com>
|
||||
SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
||||
Senemu <10880819+Senemu@users.noreply.github.com>
|
||||
Sergey Alirzaev <zl29ah@gmail.com>
|
||||
Sergio López <slp@redhat.com>
|
||||
Sergio López <slp@sinrega.org>
|
||||
Sertaç Özercan <852750+sozercan@users.noreply.github.com>
|
||||
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
||||
ShadovvBeast <ShadovvBeast@gmail.com>
|
||||
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
||||
Shane A <shanea@allenai.org>
|
||||
Shangning Xu <32517059+xushangning@users.noreply.github.com>
|
||||
Shankar <gshankar.87@gmail.com>
|
||||
Shanshan Shen <467638484@qq.com>
|
||||
Shijie <821898965@qq.com>
|
||||
Shintarou Okada <kokuzen@gmail.com>
|
||||
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
||||
Shouzheng Liu <lshzh.hi@gmail.com>
|
||||
Shuichi Tsutsumi <shuichi0526@gmail.com>
|
||||
Shupei Fan <dymarkfan@outlook.com>
|
||||
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
||||
Simon Willison <swillison@gmail.com>
|
||||
Siwen Yu <yusiwen@gmail.com>
|
||||
Sky Yan <skyan83@gmail.com>
|
||||
Slaren <2141330+slaren@users.noreply.github.com>
|
||||
Slava Primenko <primenko.s@gmail.com>
|
||||
Small Grass Forest <zixuanxcl@gmail.com>
|
||||
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
|
||||
Someone <sergei.kozlukov@aalto.fi>
|
||||
Someone Serge <sergei.kozlukov@aalto.fi>
|
||||
@@ -491,12 +616,15 @@ Stefan Sydow <stefan@sydow.email>
|
||||
Steffen Röcker <sroecker@gmail.com>
|
||||
Stephan Walter <stephan@walter.name>
|
||||
Stephen Nichols <snichols@users.noreply.github.com>
|
||||
Steve Bonds <sbonds@gmail.com>
|
||||
Steve Grubb <ausearch.1@gmail.com>
|
||||
Steven Prichard <spprichard20@gmail.com>
|
||||
Steven Roussey <sroussey@gmail.com>
|
||||
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
||||
StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
|
||||
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
||||
SuperUserNameMan <yoann@terminajones.com>
|
||||
Sutou Kouhei <kou@cozmixng.org>
|
||||
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
|
||||
Taikono-Himazin <kazu@po.harenet.ne.jp>
|
||||
Tameem <113388789+AhmadTameem@users.noreply.github.com>
|
||||
@@ -507,7 +635,9 @@ Theia Vogel <theia@vgel.me>
|
||||
Thérence <13496987+Royalphax@users.noreply.github.com>
|
||||
Thibault Terrasson <thibault.terrasson@gmail.com>
|
||||
Thomas Klausner <wiz@gatalith.at>
|
||||
Thorsten Sommer <SommerEngineering@users.noreply.github.com>
|
||||
Tim Miller <drasticactions@users.noreply.github.com>
|
||||
Tim Wang <overocean@gmail.com>
|
||||
Timmy Knight <r2d2fish@gmail.com>
|
||||
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
|
||||
Ting Lou <ting.lou@gmail.com>
|
||||
@@ -517,24 +647,31 @@ Tom C <tom.corelis@gmail.com>
|
||||
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
||||
Tomas <tom.tomas.36478119@gmail.com>
|
||||
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
||||
Tony Wasserka <4840017+neobrain@users.noreply.github.com>
|
||||
Tristan Druyen <tristan@vault81.mozmail.com>
|
||||
Tristan Ross <rosscomputerguy@protonmail.com>
|
||||
Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
|
||||
Tungsten842 <886724vf@anonaddy.me>
|
||||
Tungsten842 <quantmint@protonmail.com>
|
||||
Tushar <ditsuke@protonmail.com>
|
||||
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
||||
Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
|
||||
Ulrich Drepper <drepper@gmail.com>
|
||||
Uzo Nweke <uzoechi@gmail.com>
|
||||
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
||||
Val Kharitonov <mail@kharvd.com>
|
||||
Valentin Konovalov <valle.ketsujin@gmail.com>
|
||||
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
||||
Vali Malinoiu <0x4139@gmail.com>
|
||||
Victor Nogueira <felladrin@gmail.com>
|
||||
Victor Z. Peng <ziliangdotme@gmail.com>
|
||||
Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
|
||||
Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
|
||||
Vlad <spitfireage@gmail.com>
|
||||
Vladimir <bogdad@gmail.com>
|
||||
Vladimir Malyutin <first-leon@yandex.ru>
|
||||
Vladimir Zorin <vladimir@deviant.guru>
|
||||
VoidIsVoid <343750470@qq.com>
|
||||
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
|
||||
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
|
||||
Weird Constructor <weirdconstructor@gmail.com>
|
||||
@@ -551,15 +688,22 @@ Xiang (Kevin) Li <kevinli020508@gmail.com>
|
||||
Xiao-Yong Jin <jinxiaoyong@gmail.com>
|
||||
XiaotaoChen <chenxiaotao1234@gmail.com>
|
||||
Xiaoyi Chen <cxychina@gmail.com>
|
||||
Xie Yanbo <xieyanbo@gmail.com>
|
||||
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
||||
Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
|
||||
Xuan Son Nguyen <thichthat@gmail.com>
|
||||
Yaiko <elyaiko@hotmail.com>
|
||||
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
||||
Yaroslav <yaroslav.yashin@me.com>
|
||||
Yazan Agha-Schrader <mountaiin@icloud.com>
|
||||
Yiming Cui <conandiy@vip.qq.com>
|
||||
Yishuo Wang <MeouSker77@outlook.com>
|
||||
Yoshi Suhara <y.suhara@gmail.com>
|
||||
Yoshi Suhara <ysuhara@nvidia.com>
|
||||
Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
|
||||
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
||||
Yui <dev@sleepyyui.com>
|
||||
Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
|
||||
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
|
||||
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
|
||||
ZHAOKAI WANG <sanxianwei@163.com>
|
||||
@@ -568,6 +712,8 @@ Zay <95888118+isaiahbjork@users.noreply.github.com>
|
||||
Zenix <zenixls2@gmail.com>
|
||||
Zhang Peiyuan <a1286225768@gmail.com>
|
||||
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
|
||||
Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
|
||||
Zhiyuan Li <lizhiyuan@uniartisan.com>
|
||||
ZhouYuChen <zhouyuchen@naver.com>
|
||||
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
||||
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
||||
@@ -581,6 +727,7 @@ alexpinel <93524949+alexpinel@users.noreply.github.com>
|
||||
alonfaraj <alonfaraj@gmail.com>
|
||||
alwqx <kenan3015@gmail.com>
|
||||
amd-lalithnc <lalithnc@amd.com>
|
||||
amritahs-ibm <amritahs@linux.vnet.ibm.com>
|
||||
andrijdavid <david@geek.mg>
|
||||
anon998 <131767832+anon998@users.noreply.github.com>
|
||||
anzz1 <anzz1@live.com>
|
||||
@@ -588,14 +735,18 @@ apaz <aarpazdera@gmail.com>
|
||||
apcameron <37645737+apcameron@users.noreply.github.com>
|
||||
arch-btw <57669023+arch-btw@users.noreply.github.com>
|
||||
arcrank <arcrank@gmail.com>
|
||||
ardfork <134447697+ardfork@users.noreply.github.com>
|
||||
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
||||
at8u <129688334+at8u@users.noreply.github.com>
|
||||
automaticcat <daogiatuank54@gmail.com>
|
||||
awatuna <23447591+awatuna@users.noreply.github.com>
|
||||
b4b4o <zwbao@foxmail.com>
|
||||
bandoti <141645996+bandoti@users.noreply.github.com>
|
||||
beiller <beiller@gmail.com>
|
||||
bhubbb <79117352+bhubbb@users.noreply.github.com>
|
||||
bmwl <brian.marshall@tolko.com>
|
||||
bobqianic <129547291+bobqianic@users.noreply.github.com>
|
||||
brucepro <git@brucepro.net>
|
||||
bryanSwk <93190252+bryanSwk@users.noreply.github.com>
|
||||
bsilvereagle <bsilvereagle@users.noreply.github.com>
|
||||
bssrdf <merlintiger@hotmail.com>
|
||||
@@ -614,10 +765,14 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com>
|
||||
crasm <crasm@git.vczf.net>
|
||||
crasm <crasm@git.vczf.us>
|
||||
daboe01 <daboe01@googlemail.com>
|
||||
daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
|
||||
daminho <37615795+daminho@users.noreply.github.com>
|
||||
david raistrick <keen99@users.noreply.github.com>
|
||||
ddh0 <dylanhalladay02@icloud.com>
|
||||
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
||||
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
||||
devojony <61173062+devojony@users.noreply.github.com>
|
||||
ditsuke <ditsuke@protonmail.com>
|
||||
divinity76 <divinity76@gmail.com>
|
||||
dm4 <sunrisedm4@gmail.com>
|
||||
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
||||
@@ -629,14 +784,18 @@ ebraminio <ebraminio@gmail.com>
|
||||
eiery <19350831+eiery@users.noreply.github.com>
|
||||
eric8607242 <e0928021388@gmail.com>
|
||||
fairydreaming <166155368+fairydreaming@users.noreply.github.com>
|
||||
fengerhu1 <2748250768@qq.com>
|
||||
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
||||
gliptic <gliptic@users.noreply.github.com>
|
||||
goerch <jhr.walter@t-online.de>
|
||||
grahameth <96447521+grahameth@users.noreply.github.com>
|
||||
gtygo <gtydoit@gmail.com>
|
||||
gwjr <502526+gwjr@users.noreply.github.com>
|
||||
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
||||
hankcs <cnhankmc@gmail.com>
|
||||
haopeng <657407891@qq.com>
|
||||
hipudding <huafengchun@gmail.com>
|
||||
hoangmit <hoangmit@users.noreply.github.com>
|
||||
hongbo.mo <352280764@qq.com>
|
||||
hopkins385 <98618192+hopkins385@users.noreply.github.com>
|
||||
@@ -649,12 +808,14 @@ hxer7963 <hxer7963@gmail.com>
|
||||
hydai <z54981220@gmail.com>
|
||||
iSma <ismail.senhaji@gmail.com>
|
||||
iacore <74560659+iacore@users.noreply.github.com>
|
||||
icppWorld <124377669+icppWorld@users.noreply.github.com>
|
||||
igarnier <igarnier@protonmail.com>
|
||||
intelmatt <61025942+intelmatt@users.noreply.github.com>
|
||||
iohub <rickyang.pro@gmail.com>
|
||||
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
|
||||
jameswu2014 <545426914@qq.com>
|
||||
jdomke <28772296+jdomke@users.noreply.github.com>
|
||||
jiez <373447296@qq.com>
|
||||
jneem <joeneeman@gmail.com>
|
||||
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
|
||||
@@ -677,28 +838,35 @@ klosax <131523366+klosax@users.noreply.github.com>
|
||||
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
||||
kunnis <kunnis@users.noreply.github.com>
|
||||
kuronekosaiko <EvanChanJ@163.com>
|
||||
kustaaya <58045274+kustaaya@users.noreply.github.com>
|
||||
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
||||
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
||||
l3utterfly <gc.pthzfoldr@gmail.com>
|
||||
laik <laik.lj@me.com>
|
||||
ldwang <ftgreat@163.com>
|
||||
le.chang <cljs118@126.com>
|
||||
leejet <leejet714@gmail.com>
|
||||
leo-pony <nengjunma@outlook.com>
|
||||
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
||||
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
|
||||
lon <114724657+longregen@users.noreply.github.com>
|
||||
loonerin <132926317+loonerin@users.noreply.github.com>
|
||||
ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
|
||||
luoyu-intel <yu.luo@intel.com>
|
||||
m3ndax <adrian.goessl@outlook.com>
|
||||
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||
makomk <makosoft@googlemail.com>
|
||||
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||
maor-ps <154728172+maor-ps@users.noreply.github.com>
|
||||
matiaslin <45382001+matiaslin@users.noreply.github.com>
|
||||
matteo <matteogeniaccio@yahoo.it>
|
||||
mdrokz <mohammadmunshi@gmail.com>
|
||||
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||
minarchist <minarchist@users.noreply.github.com>
|
||||
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
||||
mmyjona <jonathan.gonse@gmail.com>
|
||||
momonga <115213907+mmnga@users.noreply.github.com>
|
||||
momonga <146910567+mmngays@users.noreply.github.com>
|
||||
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
||||
mzcu <milos.cubrilo@gmail.com>
|
||||
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
||||
@@ -716,8 +884,10 @@ omahs <73983677+omahs@users.noreply.github.com>
|
||||
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||
opparco <parco.opaai@gmail.com>
|
||||
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||
pculliton <phillipculliton@gmail.com>
|
||||
pengxin99 <pengxin.yuan@intel.com>
|
||||
perserk <perserk@gmail.com>
|
||||
piDack <104877312+piDack@users.noreply.github.com>
|
||||
pmysl <piotr.myslinski@outlook.com>
|
||||
postmasters <namnguyen@google.com>
|
||||
pudepiedj <pudepiedj@gmail.com>
|
||||
@@ -733,6 +903,7 @@ runfuture <runfuture@users.noreply.github.com>
|
||||
sandyiscool <sandyiscool@gmail.com>
|
||||
sasha0552 <admin@sasha0552.org>
|
||||
semidark <me@semidark.net>
|
||||
serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
|
||||
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||
shibe2 <shibe@tuta.io>
|
||||
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||
@@ -741,42 +912,55 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
|
||||
slaren <2141330+slaren@users.noreply.github.com>
|
||||
slaren <slarengh@gmail.com>
|
||||
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||
standby24x7 <standby24x7@gmail.com>
|
||||
staviq <staviq@gmail.com>
|
||||
stduhpf <stephduh@live.fr>
|
||||
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
|
||||
swittk <switt1995@gmail.com>
|
||||
takov751 <40316768+takov751@users.noreply.github.com>
|
||||
tarcey <cey.tarik@gmail.com>
|
||||
tc-mb <157115220+tc-mb@users.noreply.github.com>
|
||||
texmex76 <40733439+texmex76@users.noreply.github.com>
|
||||
thement <40525767+thement@users.noreply.github.com>
|
||||
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
|
||||
tjohnman <tjohnman@users.noreply.github.com>
|
||||
toyer <2042519524@qq.com>
|
||||
tslmy <tslmy@users.noreply.github.com>
|
||||
ubik2 <ubik2@users.noreply.github.com>
|
||||
uint256_t <konndennsa@gmail.com>
|
||||
uint256_t <maekawatoshiki1017@gmail.com>
|
||||
unbounded <haakon@likedan.net>
|
||||
uvos <devnull@uvos.xyz>
|
||||
valiray <133289098+valiray@users.noreply.github.com>
|
||||
vb <vaibhavs10@gmail.com>
|
||||
vik <vikhyatk@gmail.com>
|
||||
viric <viric@viric.name>
|
||||
vodkaslime <646329483@qq.com>
|
||||
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||
wangshuai09 <391746016@qq.com>
|
||||
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
||||
whoreson <139810751+whoreson@users.noreply.github.com>
|
||||
woachk <24752637+woachk@users.noreply.github.com>
|
||||
wonjun Jang <strutive07@gmail.com>
|
||||
woodx <124784234+woodx9@users.noreply.github.com>
|
||||
wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
|
||||
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||
xaedes <xaedes@gmail.com>
|
||||
xaedes <xaedes@googlemail.com>
|
||||
xctan <axunlei@gmail.com>
|
||||
xloem <0xloem@gmail.com>
|
||||
yangli2 <yangli2@gmail.com>
|
||||
yuiseki <yuiseki@gmail.com>
|
||||
yuri@FreeBSD <yurivict@users.noreply.github.com>
|
||||
zakkor <edward.partenie@gmail.com>
|
||||
zhangkaihuo <zhangkaihuo@gmail.com>
|
||||
zhentaoyu <zhentao.yu@intel.com>
|
||||
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||
zhouwg <zhouwg2000@gmail.com>
|
||||
zrm <trustiosity.zrm@gmail.com>
|
||||
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
|
||||
杨朱 · Kiki <baofa.fan@daocloud.io>
|
||||
源文雨 <41315874+fumiama@users.noreply.github.com>
|
||||
蕭澧邦 <45505768+shou692199@users.noreply.github.com>
|
||||
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
||||
|
||||
@@ -82,6 +82,7 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
||||
|
||||
# Required for relocatable CMake package
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
|
||||
|
||||
# override ggml options
|
||||
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
|
||||
|
||||
22
Makefile
22
Makefile
@@ -254,8 +254,8 @@ endif
|
||||
# keep standard at C11 and C++11
|
||||
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
|
||||
MK_CFLAGS = -std=c11 -fPIC
|
||||
MK_CXXFLAGS = -std=c++11 -fPIC
|
||||
MK_NVCCFLAGS = -std=c++11
|
||||
MK_CXXFLAGS = -std=c++17 -fPIC
|
||||
MK_NVCCFLAGS = -std=c++17
|
||||
|
||||
ifdef LLAMA_NO_CCACHE
|
||||
GGML_NO_CCACHE := 1
|
||||
@@ -575,9 +575,12 @@ endif
|
||||
|
||||
ifndef GGML_NO_AMX
|
||||
MK_CPPFLAGS += -DGGML_USE_AMX
|
||||
OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
|
||||
OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
|
||||
endif
|
||||
|
||||
# only necessary for the CPU backend files
|
||||
MK_CPPFLAGS += -Iggml/src/ggml-cpu
|
||||
|
||||
ifdef GGML_RPC
|
||||
MK_CPPFLAGS += -DGGML_USE_RPC
|
||||
OBJ_GGML_EXT += ggml/src/ggml-rpc.o
|
||||
@@ -752,7 +755,7 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||
|
||||
endif # GGML_VULKAN
|
||||
|
||||
ifdef GGML_HIPBLAS
|
||||
ifdef GGML_HIP
|
||||
ifeq ($(wildcard /opt/rocm),)
|
||||
ROCM_PATH ?= /usr
|
||||
AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
|
||||
@@ -807,7 +810,7 @@ ggml/src/ggml-cuda/%.o: \
|
||||
ggml/src/ggml-common.h \
|
||||
ggml/src/ggml-cuda/common.cuh
|
||||
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
||||
endif # GGML_HIPBLAS
|
||||
endif # GGML_HIP
|
||||
|
||||
ifdef GGML_MUSA
|
||||
ifeq ($(wildcard /opt/musa),)
|
||||
@@ -815,7 +818,7 @@ ifdef GGML_MUSA
|
||||
else
|
||||
MUSA_PATH ?= /opt/musa
|
||||
endif
|
||||
MTGPU_TARGETS ?= mp_21 mp_22
|
||||
MUSA_ARCHITECTURES ?= 21;22
|
||||
|
||||
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
|
||||
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
|
||||
@@ -834,7 +837,8 @@ ifdef GGML_MUSA
|
||||
CXX := $(MUSA_PATH)/bin/clang++
|
||||
MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
|
||||
|
||||
MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
|
||||
MUSAFLAGS = -x musa -mtgpu
|
||||
MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
|
||||
|
||||
ifdef GGML_CUDA_FORCE_MMQ
|
||||
MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
|
||||
@@ -878,14 +882,14 @@ ggml/src/ggml-cuda/ggml-cuda.o: \
|
||||
ggml/src/ggml-backend-impl.h \
|
||||
ggml/src/ggml-common.h \
|
||||
$(wildcard ggml/src/ggml-cuda/*.cuh)
|
||||
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
|
||||
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
|
||||
|
||||
ggml/src/ggml-cuda/%.o: \
|
||||
ggml/src/ggml-cuda/%.cu \
|
||||
ggml/include/ggml.h \
|
||||
ggml/src/ggml-common.h \
|
||||
ggml/src/ggml-cuda/common.cuh
|
||||
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
|
||||
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
|
||||
endif # GGML_MUSA
|
||||
|
||||
ifdef GGML_METAL
|
||||
|
||||
@@ -28,13 +28,16 @@ var cSettings: [CSetting] = [
|
||||
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
|
||||
.unsafeFlags(["-fno-objc-arc"]),
|
||||
.headerSearchPath("ggml/src"),
|
||||
.headerSearchPath("ggml/src/ggml-cpu"),
|
||||
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
||||
// We should consider add this in the future when we drop support for iOS 14
|
||||
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
|
||||
// .define("ACCELERATE_NEW_LAPACK"),
|
||||
// .define("ACCELERATE_LAPACK_ILP64")
|
||||
.define("GGML_USE_CPU"),
|
||||
]
|
||||
|
||||
|
||||
#if canImport(Darwin)
|
||||
sources.append("ggml/src/ggml-common.h")
|
||||
sources.append("ggml/src/ggml-metal/ggml-metal.m")
|
||||
@@ -44,7 +47,6 @@ cSettings.append(
|
||||
contentsOf: [
|
||||
.define("GGML_USE_ACCELERATE"),
|
||||
.define("GGML_USE_METAL"),
|
||||
.define("GGML_USE_CPU")
|
||||
]
|
||||
)
|
||||
#endif
|
||||
|
||||
59
README.md
59
README.md
@@ -79,6 +79,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||
- [x] [OLMo](https://allenai.org/olmo)
|
||||
- [x] [OLMo 2](https://allenai.org/olmo)
|
||||
- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
|
||||
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
|
||||
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
|
||||
@@ -139,44 +140,36 @@ Typically finetunes of the base models below are supported as well.
|
||||
|
||||
**UI:**
|
||||
|
||||
Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
|
||||
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
|
||||
- [iohub/collama](https://github.com/iohub/coLLaMA)
|
||||
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
|
||||
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
|
||||
- [Dot](https://github.com/alexpinel/Dot) (GPL)
|
||||
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
|
||||
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
|
||||
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
|
||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||
- [Faraday](https://faraday.dev/) (proprietary)
|
||||
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
|
||||
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
|
||||
- [LARS](https://github.com/abgulati/LARS) (AGPL)
|
||||
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
|
||||
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
||||
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
|
||||
- [LMStudio](https://lmstudio.ai/) (proprietary)
|
||||
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
|
||||
- [ramalama](https://github.com/containers/ramalama) (MIT)
|
||||
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
|
||||
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
|
||||
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
|
||||
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
|
||||
- [ollama/ollama](https://github.com/ollama/ollama)
|
||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
|
||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
|
||||
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
|
||||
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
|
||||
- [RAGNA Desktop](https://ragna.app/) (proprietary)
|
||||
- [RecurseChat](https://recurse.chat/) (proprietary)
|
||||
- [semperai/amica](https://github.com/semperai/amica)
|
||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
|
||||
- [Msty](https://msty.app) (proprietary)
|
||||
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
||||
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
|
||||
- [Dot](https://github.com/alexpinel/Dot) (GPL)
|
||||
- [MindMac](https://mindmac.app) (proprietary)
|
||||
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
|
||||
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
|
||||
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
|
||||
- [AIKit](https://github.com/sozercan/aikit) (MIT)
|
||||
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
|
||||
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
|
||||
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
|
||||
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
|
||||
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
|
||||
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
|
||||
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
|
||||
- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
|
||||
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
|
||||
- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
|
||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
|
||||
- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
|
||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
|
||||
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
|
||||
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
|
||||
- [ramalama](https://github.com/containers/ramalama) (MIT)
|
||||
- [semperai/amica](https://github.com/semperai/amica) (MIT)
|
||||
- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
|
||||
|
||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||
|
||||
|
||||
33
cmake/common.cmake
Normal file
33
cmake/common.cmake
Normal file
@@ -0,0 +1,33 @@
|
||||
function(llama_add_compile_flags)
|
||||
if (LLAMA_FATAL_WARNINGS)
|
||||
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
list(APPEND C_FLAGS -Werror)
|
||||
list(APPEND CXX_FLAGS -Werror)
|
||||
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
add_compile_options(/WX)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_ALL_WARNINGS)
|
||||
if (NOT MSVC)
|
||||
list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
|
||||
-Werror=implicit-int -Werror=implicit-function-declaration)
|
||||
|
||||
list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
|
||||
|
||||
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
|
||||
|
||||
list(APPEND C_FLAGS ${WARNING_FLAGS})
|
||||
list(APPEND CXX_FLAGS ${WARNING_FLAGS})
|
||||
|
||||
ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
|
||||
else()
|
||||
# todo : msvc
|
||||
set(C_FLAGS "" PARENT_SCOPE)
|
||||
set(CXX_FLAGS "" PARENT_SCOPE)
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
llama_add_compile_flags()
|
||||
|
||||
# Build info header
|
||||
#
|
||||
|
||||
@@ -86,5 +88,5 @@ if (LLAMA_CURL)
|
||||
endif ()
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_11)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||
|
||||
@@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
|
||||
}
|
||||
params.hf_file = params.model;
|
||||
} else if (params.model.empty()) {
|
||||
params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
|
||||
// this is to avoid different repo having same file name, or same file name in different subdirs
|
||||
std::string filename = params.hf_repo + "_" + params.hf_file;
|
||||
// to make sure we don't have any slashes in the filename
|
||||
string_replace_all(filename, "/", "_");
|
||||
params.model = fs_get_cache_file(filename);
|
||||
}
|
||||
} else if (!params.model_url.empty()) {
|
||||
if (params.model.empty()) {
|
||||
@@ -1366,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, int value) {
|
||||
params.n_gpu_layers = value;
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
|
||||
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
||||
@@ -2100,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.n_gpu_layers = value;
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
|
||||
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
|
||||
@@ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) {
|
||||
|
||||
std::u32string filename_utf32;
|
||||
try {
|
||||
#if defined(__clang__)
|
||||
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||
# pragma clang diagnostic push
|
||||
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
||||
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic pop
|
||||
#endif
|
||||
|
||||
filename_utf32 = converter.from_bytes(filename);
|
||||
|
||||
// If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
|
||||
@@ -829,9 +839,9 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
llama_model * model = nullptr;
|
||||
|
||||
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
||||
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
|
||||
} else if (!params.model_url.empty()) {
|
||||
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
|
||||
} else {
|
||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
}
|
||||
@@ -1342,17 +1352,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const char * model_url,
|
||||
const char * path_model,
|
||||
const char * hf_token,
|
||||
const std::string & model_url,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
// Basic validation of the model_url
|
||||
if (!model_url || strlen(model_url) == 0) {
|
||||
if (model_url.empty()) {
|
||||
LOG_ERR("%s: invalid model_url\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!common_download_file(model_url, path_model, hf_token)) {
|
||||
if (!common_download_file(model_url, local_path, hf_token)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -1363,9 +1373,9 @@ struct llama_model * common_load_model_from_url(
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
|
||||
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
|
||||
if (!ctx_gguf) {
|
||||
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
|
||||
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -1384,13 +1394,13 @@ struct llama_model * common_load_model_from_url(
|
||||
// Verify the first split file format
|
||||
// and extract split URL and PATH prefixes
|
||||
{
|
||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
|
||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
|
||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@@ -1417,14 +1427,14 @@ struct llama_model * common_load_model_from_url(
|
||||
}
|
||||
}
|
||||
|
||||
return llama_load_model_from_file(path_model, params);
|
||||
return llama_load_model_from_file(local_path.c_str(), params);
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const char * repo,
|
||||
const char * model,
|
||||
const char * path_model,
|
||||
const char * hf_token,
|
||||
const std::string & repo,
|
||||
const std::string & remote_path,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
// construct hugging face model url:
|
||||
//
|
||||
@@ -1438,27 +1448,27 @@ struct llama_model * common_load_model_from_hf(
|
||||
std::string model_url = "https://huggingface.co/";
|
||||
model_url += repo;
|
||||
model_url += "/resolve/main/";
|
||||
model_url += model;
|
||||
model_url += remote_path;
|
||||
|
||||
return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
|
||||
return common_load_model_from_url(model_url, local_path, hf_token, params);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const char * /*model_url*/,
|
||||
const char * /*path_model*/,
|
||||
const char * /*hf_token*/,
|
||||
const std::string & /*model_url*/,
|
||||
const std::string & /*local_path*/,
|
||||
const std::string & /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const char * /*repo*/,
|
||||
const char * /*model*/,
|
||||
const char * /*path_model*/,
|
||||
const char * /*hf_token*/,
|
||||
const std::string & /*repo*/,
|
||||
const std::string & /*remote_path*/,
|
||||
const std::string & /*local_path*/,
|
||||
const std::string & /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
return nullptr;
|
||||
|
||||
@@ -470,8 +470,17 @@ struct llama_model_params common_model_params_to_llama ( common_params
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const std::string & model_url,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const std::string & repo,
|
||||
const std::string & remote_path,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
|
||||
|
||||
@@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
|
||||
Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
|
||||
|
||||
```
|
||||
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
|
||||
$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
|
||||
```
|
||||
|
||||
Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
|
||||
Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
|
||||
|
||||
To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
|
||||
|
||||
|
||||
@@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||
|
||||
## News
|
||||
|
||||
- 2024.11
|
||||
- Support F16 and F32 data type model for Ascend 310P NPU.
|
||||
- 2024.8
|
||||
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
|
||||
- 2024.7
|
||||
@@ -40,9 +42,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||
### Ascend NPU
|
||||
|
||||
**Verified devices**
|
||||
|
||||
| Ascend NPU | Status |
|
||||
|:-----------------------------:|:-------:|
|
||||
| Atlas 300T A2 | Support |
|
||||
| Atlas 300I Duo | Support |
|
||||
|
||||
*Notes:*
|
||||
|
||||
|
||||
@@ -221,7 +221,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
||||
|
||||
- Using `make`:
|
||||
```bash
|
||||
make GGML_HIPBLAS=1
|
||||
make GGML_HIP=1
|
||||
```
|
||||
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
|
||||
```bash
|
||||
@@ -249,7 +249,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
||||
|
||||
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
||||
```bash
|
||||
make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
||||
make -j16 GGML_HIP=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
||||
```
|
||||
|
||||
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
||||
|
||||
@@ -6,6 +6,10 @@ find_package(Threads REQUIRED)
|
||||
|
||||
# ...
|
||||
|
||||
# flags
|
||||
|
||||
llama_add_compile_flags()
|
||||
|
||||
# examples
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-batched-bench)
|
||||
add_executable(${TARGET} batched-bench.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-batched)
|
||||
add_executable(${TARGET} batched.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml)
|
||||
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator)
|
||||
add_executable(${TARGET} cvector-generator.cpp pca.hpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-embedding)
|
||||
add_executable(${TARGET} embedding.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,7 +2,7 @@ set(TARGET llama-eval-callback)
|
||||
add_executable(${TARGET} eval-callback.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TEST_TARGET test-eval-callback)
|
||||
add_test(NAME ${TEST_TARGET}
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-export-lora)
|
||||
add_executable(${TARGET} export-lora.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator)
|
||||
add_executable(${TARGET} gbnf-validator.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-gen-docs)
|
||||
add_executable(${TARGET} gen-docs.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -4,12 +4,19 @@ install(TARGETS ${TARGET} RUNTIME)
|
||||
|
||||
# clibs dependencies
|
||||
include_directories(deps/)
|
||||
|
||||
add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
|
||||
target_link_libraries(${TARGET} PRIVATE xxhash)
|
||||
|
||||
add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
|
||||
target_link_libraries(${TARGET} PRIVATE sha1)
|
||||
if (NOT MSVC)
|
||||
# disable warnings in 3rd party code
|
||||
target_compile_options(sha1 PRIVATE -w)
|
||||
endif()
|
||||
|
||||
add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
|
||||
target_link_libraries(${TARGET} PRIVATE sha256)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-gguf-split)
|
||||
add_executable(${TARGET} gguf-split.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-gguf)
|
||||
add_executable(${TARGET} gguf.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-gritlm)
|
||||
add_executable(${TARGET} gritlm.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-imatrix)
|
||||
add_executable(${TARGET} imatrix.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -637,10 +637,19 @@ int main(int argc, char ** argv) {
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
if (!compute_imatrix(ctx, params)) {
|
||||
return 1;
|
||||
if (params.prompt.empty()) {
|
||||
if (params.in_files.empty()) {
|
||||
LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
|
||||
return 1;
|
||||
}
|
||||
LOG_INF("No prompt provided; combining precomputed matrices only.\n");
|
||||
} else {
|
||||
if (!compute_imatrix(ctx, params)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
g_collector.save_imatrix();
|
||||
|
||||
LOG("\n");
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-infill)
|
||||
add_executable(${TARGET} infill.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-bench)
|
||||
add_executable(${TARGET} llama-bench.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .)
|
||||
target_include_directories(llava PUBLIC ../..)
|
||||
target_include_directories(llava PUBLIC ../../common)
|
||||
|
||||
target_compile_features(llava PRIVATE cxx_std_11)
|
||||
target_compile_features(llava PRIVATE cxx_std_17)
|
||||
|
||||
add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
@@ -35,11 +35,11 @@ add_executable(${TARGET} llava-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-minicpmv-cli)
|
||||
add_executable(${TARGET} minicpmv-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -40,10 +40,17 @@
|
||||
#include <cinttypes>
|
||||
#include <limits>
|
||||
|
||||
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
#if defined(LLAVA_LOG_OFF)
|
||||
# define LOG_INF(...)
|
||||
# define LOG_WRN(...)
|
||||
# define LOG_ERR(...)
|
||||
# define LOG_DBG(...)
|
||||
#else // defined(LLAVA_LOG_OFF)
|
||||
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||
#endif // defined(LLAVA_LOG_OFF)
|
||||
|
||||
//#define CLIP_DEBUG_FUNCTIONS
|
||||
|
||||
|
||||
@@ -11,13 +11,17 @@
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||
#if defined(LLAVA_LOG_OFF)
|
||||
# define LOG_INF(...)
|
||||
# define LOG_WRN(...)
|
||||
# define LOG_ERR(...)
|
||||
# define LOG_DBG(...)
|
||||
#else // defined(LLAVA_LOG_OFF)
|
||||
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||
#endif // defined(LLAVA_LOG_OFF)
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
@@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
|
||||
errno = 0;
|
||||
size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
|
||||
if (ferror(file)) {
|
||||
die_fmt("read error: %s", strerror(errno));
|
||||
LOG_ERR("read error: %s", strerror(errno));
|
||||
free(buffer);
|
||||
fclose(file);
|
||||
return false;
|
||||
}
|
||||
if (ret != (size_t) fileSize) {
|
||||
die("unexpectedly reached end of file");
|
||||
LOG_ERR("unexpectedly reached end of file");
|
||||
free(buffer);
|
||||
fclose(file);
|
||||
return false;
|
||||
}
|
||||
fclose(file); // Close the file
|
||||
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-lookahead)
|
||||
add_executable(${TARGET} lookahead.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,22 +2,22 @@ set(TARGET llama-lookup)
|
||||
add_executable(${TARGET} lookup.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-lookup-create)
|
||||
add_executable(${TARGET} lookup-create.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-lookup-merge)
|
||||
add_executable(${TARGET} lookup-merge.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-lookup-stats)
|
||||
add_executable(${TARGET} lookup-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
|
||||
target_include_directories(${TARGET} PRIVATE ${_common_path})
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-cli)
|
||||
add_executable(${TARGET} main.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-parallel)
|
||||
add_executable(${TARGET} parallel.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-passkey)
|
||||
add_executable(${TARGET} passkey.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-perplexity)
|
||||
add_executable(${TARGET} perplexity.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-retrieval)
|
||||
add_executable(${TARGET} retrieval.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-run)
|
||||
add_executable(${TARGET} run.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -50,4 +50,4 @@ if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2267,49 +2267,48 @@ struct server_context {
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
llama_token id;
|
||||
llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
|
||||
|
||||
{
|
||||
completion_token_output result;
|
||||
slot.i_batch = -1;
|
||||
|
||||
id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
|
||||
common_sampler_accept(slot.smpl, id, true);
|
||||
|
||||
slot.i_batch = -1;
|
||||
|
||||
common_sampler_accept(slot.smpl, id, true);
|
||||
|
||||
slot.n_decoded += 1;
|
||||
if (slot.n_decoded == 1) {
|
||||
slot.t_start_generation = ggml_time_us();
|
||||
slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
|
||||
metrics.on_prompt_eval(slot);
|
||||
}
|
||||
|
||||
result.tok = id;
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.smpl);
|
||||
|
||||
for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p->data[i].id,
|
||||
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
|
||||
});
|
||||
}
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
// release slot because of stop condition
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
metrics.on_prediction(slot);
|
||||
continue;
|
||||
}
|
||||
slot.n_decoded += 1;
|
||||
if (slot.n_decoded == 1) {
|
||||
slot.t_start_generation = ggml_time_us();
|
||||
slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
|
||||
metrics.on_prompt_eval(slot);
|
||||
}
|
||||
|
||||
// check if the slot supports speculative decoding
|
||||
if (!slot.can_speculate()) {
|
||||
completion_token_output result;
|
||||
result.tok = id;
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.smpl);
|
||||
|
||||
for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p->data[i].id,
|
||||
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
|
||||
});
|
||||
}
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
// release slot because of stop condition
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
metrics.on_prediction(slot);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// do speculative decoding
|
||||
for (auto & slot : slots) {
|
||||
if (!slot.is_processing() || !slot.can_speculate()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
llama_token id = slot.sampled;
|
||||
|
||||
struct common_speculative_params params_spec;
|
||||
params_spec.n_draft = slot.params.speculative.n_max;
|
||||
|
||||
1
examples/server/tests/.gitignore
vendored
1
examples/server/tests/.gitignore
vendored
@@ -1 +1,2 @@
|
||||
.venv
|
||||
tmp
|
||||
|
||||
@@ -1,19 +1,9 @@
|
||||
# Server tests
|
||||
|
||||
Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development)
|
||||
and [behave](https://behave.readthedocs.io/en/latest/):
|
||||
|
||||
* [issues.feature](./features/issues.feature) Pending issues scenario
|
||||
* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
|
||||
* [security.feature](./features/security.feature) Security, CORS and API Key
|
||||
* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
|
||||
Python based server tests scenario using [pytest](https://docs.pytest.org/en/stable/).
|
||||
|
||||
Tests target GitHub workflows job runners with 4 vCPU.
|
||||
|
||||
Requests are
|
||||
using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html)
|
||||
based http client.
|
||||
|
||||
Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail.
|
||||
To mitigate it, you can increase values in `n_predict`, `kv_size`.
|
||||
|
||||
@@ -39,26 +29,19 @@ It's possible to override some scenario steps values with environment variables:
|
||||
|--------------------------|------------------------------------------------------------------------------------------------|
|
||||
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
|
||||
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
|
||||
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
|
||||
| `DEBUG` | to enable steps and server verbose mode `--verbose` |
|
||||
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
|
||||
|
||||
### Run @bug, @wip or @wrong_usage annotated scenario
|
||||
|
||||
Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
|
||||
|
||||
- `@bug` annotation aims to link a scenario with a GitHub issue.
|
||||
- `@wrong_usage` are meant to show user issue that are actually an expected behavior
|
||||
- `@wip` to focus on a scenario working in progress
|
||||
- `@slow` heavy test, disabled by default
|
||||
|
||||
To run a scenario annotated with `@bug`, start:
|
||||
To run slow tests:
|
||||
|
||||
```shell
|
||||
DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
|
||||
SLOW_TESTS=1 ./tests.sh
|
||||
```
|
||||
|
||||
After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
|
||||
To run with stdout/stderr display in real time (verbose output, but useful for debugging):
|
||||
|
||||
```shell
|
||||
./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile"
|
||||
DEBUG=1 ./tests.sh -s -v -x
|
||||
```
|
||||
|
||||
To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)
|
||||
|
||||
15
examples/server/tests/conftest.py
Normal file
15
examples/server/tests/conftest.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
|
||||
# ref: https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test
|
||||
@pytest.fixture(autouse=True)
|
||||
def stop_server_after_each_test():
|
||||
# do nothing before each test
|
||||
yield
|
||||
# stop all servers after each test
|
||||
instances = set(
|
||||
server_instances
|
||||
) # copy the set to prevent 'Set changed size during iteration'
|
||||
for server in instances:
|
||||
server.stop()
|
||||
@@ -1,66 +0,0 @@
|
||||
@llama.cpp
|
||||
@ctx_shift
|
||||
Feature: llama.cpp server
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||
And a model file test-model.gguf
|
||||
And a model alias tinyllama-2
|
||||
And BOS token is 1
|
||||
And 42 as server seed
|
||||
And 256 KV cache size
|
||||
And 32 as batch size
|
||||
And 2 slots
|
||||
|
||||
# the prompt is 301 tokens
|
||||
# the slot context is 256/2 = 128 tokens
|
||||
# the prompt is truncated to keep the last 109 tokens
|
||||
# 64 tokens are generated thanks to shifting the context when it gets full
|
||||
Scenario: Inference with context shift
|
||||
And 64 server max tokens to predict
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
Given a prompt:
|
||||
"""
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
||||
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
|
||||
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
"""
|
||||
And a completion request with no api error
|
||||
Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
|
||||
And the completion is truncated
|
||||
And 109 prompt tokens are processed
|
||||
|
||||
Scenario Outline: Inference without context shift
|
||||
And <n_predict> server max tokens to predict
|
||||
And disable context shifting
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
Given a prompt:
|
||||
"""
|
||||
Hi how are you
|
||||
"""
|
||||
And a completion request with no api error
|
||||
Then <n_token_output> tokens are predicted matching twind|Anna
|
||||
And the completion is <truncated> truncated
|
||||
And 8 prompt tokens are processed
|
||||
Examples:
|
||||
| n_predict | n_token_output | truncated |
|
||||
| 64 | 64 | not |
|
||||
| -1 | 120 | |
|
||||
|
||||
Scenario: Inference without context shift (expected error: prompt too long)
|
||||
And disable context shifting
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
Given a prompt:
|
||||
"""
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
||||
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
|
||||
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
"""
|
||||
And a completion request with 400 api error
|
||||
|
||||
@@ -1,113 +0,0 @@
|
||||
@llama.cpp
|
||||
@embeddings
|
||||
Feature: llama.cpp server
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
|
||||
And a model file bert-bge-small.gguf
|
||||
And a model alias bert-bge-small
|
||||
And 42 as server seed
|
||||
And 2 slots
|
||||
# the bert-bge-small model has context size of 512
|
||||
# since the generated prompts are as big as the batch size, we need to set the batch size to <= 512
|
||||
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
|
||||
And 128 as batch size
|
||||
And 128 as ubatch size
|
||||
And 512 KV cache size
|
||||
And enable embeddings endpoint
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario: Embedding
|
||||
When embeddings are computed for:
|
||||
"""
|
||||
What is the capital of Bulgaria ?
|
||||
"""
|
||||
Then embeddings are generated
|
||||
|
||||
Scenario: Embedding (error: prompt too long)
|
||||
When embeddings are computed for:
|
||||
"""
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
||||
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
|
||||
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
||||
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
|
||||
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
"""
|
||||
And embeddings request with 500 api error
|
||||
|
||||
Scenario: OAI Embeddings compatibility
|
||||
Given a model bert-bge-small
|
||||
When an OAI compatible embeddings computation request for:
|
||||
"""
|
||||
What is the capital of Spain ?
|
||||
"""
|
||||
Then embeddings are generated
|
||||
|
||||
Scenario: OAI Embeddings compatibility with multiple inputs
|
||||
Given a model bert-bge-small
|
||||
Given a prompt:
|
||||
"""
|
||||
In which country Paris is located ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Is Madrid the capital of Spain ?
|
||||
"""
|
||||
When an OAI compatible embeddings computation request for multiple inputs
|
||||
Then embeddings are generated
|
||||
|
||||
Scenario: Multi users embeddings
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another very long music lyrics.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long poem.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long joke.
|
||||
"""
|
||||
Given concurrent embedding requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all embeddings are generated
|
||||
|
||||
Scenario: Multi users OAI compatibility embeddings
|
||||
Given a prompt:
|
||||
"""
|
||||
In which country Paris is located ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Is Madrid the capital of Spain ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
What is the biggest US city ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
What is the capital of Bulgaria ?
|
||||
"""
|
||||
And a model bert-bge-small
|
||||
Given concurrent OAI embedding requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all embeddings are generated
|
||||
|
||||
Scenario: All embeddings should be the same
|
||||
Given 10 fixed prompts
|
||||
And a model bert-bge-small
|
||||
Given concurrent OAI embedding requests
|
||||
Then all embeddings are the same
|
||||
@@ -1,71 +0,0 @@
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from contextlib import closing
|
||||
from subprocess import TimeoutExpired
|
||||
|
||||
|
||||
def before_scenario(context, scenario):
|
||||
context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
|
||||
if context.debug:
|
||||
print("DEBUG=ON")
|
||||
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
|
||||
port = 8080
|
||||
if 'PORT' in os.environ:
|
||||
port = int(os.environ['PORT'])
|
||||
if is_server_listening("localhost", port):
|
||||
assert False, "Server already started"
|
||||
|
||||
|
||||
def after_scenario(context, scenario):
|
||||
try:
|
||||
if 'server_process' not in context or context.server_process is None:
|
||||
return
|
||||
if scenario.status == "failed":
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n")
|
||||
if os.path.isfile('llama.log'):
|
||||
with closing(open('llama.log', 'r')) as f:
|
||||
for line in f:
|
||||
print(line)
|
||||
if not is_server_listening(context.server_fqdn, context.server_port):
|
||||
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
|
||||
|
||||
if context.server_process.poll() is not None:
|
||||
assert False, f"Server not running pid={context.server_process.pid} ..."
|
||||
|
||||
server_graceful_shutdown(context) # SIGINT
|
||||
|
||||
try:
|
||||
context.server_process.wait(0.5)
|
||||
except TimeoutExpired:
|
||||
print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...")
|
||||
context.server_process.kill() # SIGKILL
|
||||
context.server_process.wait()
|
||||
|
||||
while is_server_listening(context.server_fqdn, context.server_port):
|
||||
time.sleep(0.1)
|
||||
except Exception:
|
||||
print("ignoring error in after_scenario:")
|
||||
traceback.print_exc(file=sys.stdout)
|
||||
|
||||
|
||||
def server_graceful_shutdown(context):
|
||||
print(f"shutting down server pid={context.server_process.pid} ...")
|
||||
if os.name == 'nt':
|
||||
interrupt = signal.CTRL_C_EVENT
|
||||
else:
|
||||
interrupt = signal.SIGINT
|
||||
context.server_process.send_signal(interrupt)
|
||||
|
||||
|
||||
def is_server_listening(server_fqdn, server_port):
|
||||
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
|
||||
result = sock.connect_ex((server_fqdn, server_port))
|
||||
_is_server_listening = result == 0
|
||||
if _is_server_listening:
|
||||
print(f"server is listening on {server_fqdn}:{server_port}...")
|
||||
return _is_server_listening
|
||||
@@ -1,36 +0,0 @@
|
||||
@llama.cpp
|
||||
@infill
|
||||
Feature: llama.cpp server
|
||||
|
||||
# The current model is made by adding FIM tokens to the existing stories260K
|
||||
# We may want to use a better model in the future, maybe something like SmolLM 360M
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/stories260K-infill.gguf from HF repo ggml-org/models
|
||||
And a model file test-model-infill.gguf
|
||||
And a model alias tinyllama-infill
|
||||
And 42 as server seed
|
||||
And 1024 as batch size
|
||||
And 1024 as ubatch size
|
||||
And 2048 KV cache size
|
||||
And 64 max tokens to predict
|
||||
And 0.0 temperature
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario: Infill without input_extra
|
||||
Given a prompt "Complete this"
|
||||
And an infill input extra none none
|
||||
And an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_"
|
||||
And an infill input suffix "}\n"
|
||||
And an infill request with no api error
|
||||
Then 64 tokens are predicted matching One|day|she|saw|big|scary|bird
|
||||
|
||||
Scenario: Infill with input_extra
|
||||
Given a prompt "Complete this"
|
||||
And an infill input extra "llama.h" "LLAMA_API int32_t llama_n_threads();\n"
|
||||
And an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_"
|
||||
And an infill input suffix "}\n"
|
||||
And an infill request with no api error
|
||||
Then 64 tokens are predicted matching cuts|Jimmy|mom|came|into|the|room"
|
||||
@@ -1,5 +0,0 @@
|
||||
# List of ongoing issues
|
||||
# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
|
||||
@bug
|
||||
Feature: Issues
|
||||
# No confirmed issue at the moment
|
||||
@@ -1,36 +0,0 @@
|
||||
@llama.cpp
|
||||
@lora
|
||||
Feature: llama.cpp server
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model url https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/stories15M_MOE-F16.gguf
|
||||
And a model file stories15M_MOE-F16.gguf
|
||||
And a model alias stories15M_MOE
|
||||
And a lora adapter file from https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf
|
||||
And 42 as server seed
|
||||
And 1024 as batch size
|
||||
And 1024 as ubatch size
|
||||
And 2048 KV cache size
|
||||
And 64 max tokens to predict
|
||||
And 0.0 temperature
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario: Completion LoRA disabled
|
||||
Given switch off lora adapter 0
|
||||
Given a prompt:
|
||||
"""
|
||||
Look in thy glass
|
||||
"""
|
||||
And a completion request with no api error
|
||||
Then 64 tokens are predicted matching little|girl|three|years|old
|
||||
|
||||
Scenario: Completion LoRA enabled
|
||||
Given switch on lora adapter 0
|
||||
Given a prompt:
|
||||
"""
|
||||
Look in thy glass
|
||||
"""
|
||||
And a completion request with no api error
|
||||
Then 64 tokens are predicted matching eye|love|glass|sun
|
||||
@@ -1,131 +0,0 @@
|
||||
@llama.cpp
|
||||
@parallel
|
||||
Feature: Parallel
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
|
||||
And a model file test-model-00001-of-00003.gguf
|
||||
And 42 as server seed
|
||||
And 128 as batch size
|
||||
And 256 KV cache size
|
||||
And 2 slots
|
||||
And continuous batching
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario Outline: Multi users completion
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another very long music lyrics.
|
||||
"""
|
||||
And <n_predict> max tokens to predict
|
||||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
Then all prompts are predicted with <n_predict> tokens
|
||||
Examples:
|
||||
| n_predict |
|
||||
| 128 |
|
||||
|
||||
Scenario Outline: Multi users OAI completions compatibility
|
||||
Given a system prompt You are a writer.
|
||||
And a model tinyllama-2
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long book.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another a poem.
|
||||
"""
|
||||
And <n_predict> max tokens to predict
|
||||
And streaming is <streaming>
|
||||
Given concurrent OAI completions requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all prompts are predicted with <n_predict> tokens
|
||||
Examples:
|
||||
| streaming | n_predict |
|
||||
| disabled | 128 |
|
||||
| enabled | 64 |
|
||||
|
||||
Scenario Outline: Multi users OAI completions compatibility no v1
|
||||
Given a system prompt You are a writer.
|
||||
And a model tinyllama-2
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long book.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another a poem.
|
||||
"""
|
||||
And <n_predict> max tokens to predict
|
||||
And streaming is <streaming>
|
||||
Given concurrent OAI completions requests no v1
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all prompts are predicted with <n_predict> tokens
|
||||
Examples:
|
||||
| streaming | n_predict |
|
||||
| disabled | 128 |
|
||||
| enabled | 64 |
|
||||
|
||||
Scenario Outline: Multi users with number of prompts exceeding number of slots
|
||||
Given a system prompt You are a writer.
|
||||
And a model tinyllama-2
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long book.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another a poem.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
What is LLM?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
The sky is blue and I love it.
|
||||
"""
|
||||
And <n_predict> max tokens to predict
|
||||
And streaming is <streaming>
|
||||
Given concurrent OAI completions requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all prompts are predicted with <n_predict> tokens
|
||||
Examples:
|
||||
| streaming | n_predict |
|
||||
| disabled | 128 |
|
||||
| enabled | 64 |
|
||||
|
||||
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another very long music lyrics.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long poem.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long joke.
|
||||
"""
|
||||
And 128 max tokens to predict
|
||||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all prompts are predicted
|
||||
@@ -1,56 +0,0 @@
|
||||
# run with: ./tests.sh --no-skipped --tags passkey
|
||||
@passkey
|
||||
@slow
|
||||
Feature: Passkey / Self-extend with context shift
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
|
||||
# Generates a long text of junk and inserts a secret passkey number inside it.
|
||||
# Then we query the LLM for the secret passkey.
|
||||
# see #3856 and #4810
|
||||
Scenario Outline: Passkey
|
||||
Given a model file <hf_file> from HF repo <hf_repo>
|
||||
And <n_batch> as batch size
|
||||
And <n_junk> as number of junk
|
||||
And <n_predicted> server max tokens to predict
|
||||
And 42 as seed
|
||||
And 0.0 temperature
|
||||
And <n_ctx> KV cache size
|
||||
And 1 slots
|
||||
And <n_ga> group attention factor to extend context size through self-extend
|
||||
And <n_ga_w> group attention width to extend context size through self-extend
|
||||
# Can be override with N_GPU_LAYERS
|
||||
And <ngl> GPU offloaded layers
|
||||
Then the server is starting
|
||||
# Higher timeout because the model may need to be downloaded from the internet
|
||||
Then the server is healthy with timeout 120 seconds
|
||||
Given available models
|
||||
Then model 0 is trained on <n_ctx_train> tokens context
|
||||
Given a prefix prompt:
|
||||
"""
|
||||
here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
|
||||
"""
|
||||
And a passkey prompt template:
|
||||
"""
|
||||
The pass key is <passkey> Remember it. <passkey> is the pass key.
|
||||
"""
|
||||
And a junk suffix prompt:
|
||||
"""
|
||||
The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
|
||||
"""
|
||||
And a suffix prompt:
|
||||
"""
|
||||
What is the pass key? The pass key is
|
||||
"""
|
||||
Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
|
||||
And a completion request with no api error
|
||||
Then <n_predicted> tokens are predicted matching <re_content>
|
||||
|
||||
Examples:
|
||||
| hf_repo | hf_file | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
|
||||
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 4 | 512 | 250 | 50 | 42 | 1 | 42 |
|
||||
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 2 | 512 | 250 | 50 | 42 | 1 | \b((?!42)\w)+\b |
|
||||
#| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
|
||||
#| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0
|
||||
# 987 |
|
||||
@@ -1,42 +0,0 @@
|
||||
@llama.cpp
|
||||
@rerank
|
||||
Feature: llama.cpp server
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model url https://huggingface.co/ggml-org/models/resolve/main/jina-reranker-v1-tiny-en/ggml-model-f16.gguf
|
||||
And a model file jina-reranker-v1-tiny-en.gguf
|
||||
And a model alias jina-reranker-v1-tiny-en
|
||||
And 42 as server seed
|
||||
And 2 slots
|
||||
And 512 as batch size
|
||||
And 512 as ubatch size
|
||||
And 512 KV cache size
|
||||
And enable reranking endpoint
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario: Rerank
|
||||
Given a rerank query:
|
||||
"""
|
||||
Machine learning is
|
||||
"""
|
||||
And a rerank document:
|
||||
"""
|
||||
A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.
|
||||
"""
|
||||
And a rerank document:
|
||||
"""
|
||||
Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.
|
||||
"""
|
||||
And a rerank document:
|
||||
"""
|
||||
Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.
|
||||
"""
|
||||
And a rerank document:
|
||||
"""
|
||||
Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine.
|
||||
"""
|
||||
When reranking request
|
||||
Then reranking results are returned
|
||||
Then reranking highest score is index 2 and lowest score is index 3
|
||||
@@ -1,118 +0,0 @@
|
||||
@llama.cpp
|
||||
@results
|
||||
Feature: Results
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
|
||||
And a model file test-model-00001-of-00003.gguf
|
||||
And 128 as batch size
|
||||
And 1024 KV cache size
|
||||
And 128 max tokens to predict
|
||||
And continuous batching
|
||||
|
||||
Scenario Outline: consistent results with same seed
|
||||
Given <n_slots> slots
|
||||
And 1.0 temperature
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
|
||||
|
||||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
Then all predictions are equal
|
||||
Examples:
|
||||
| n_slots |
|
||||
| 1 |
|
||||
# FIXME: unified KV cache nondeterminism
|
||||
# | 2 |
|
||||
|
||||
Scenario Outline: different results with different seed
|
||||
Given <n_slots> slots
|
||||
And 1.0 temperature
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
|
||||
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
|
||||
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
|
||||
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45
|
||||
|
||||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
Then all predictions are different
|
||||
Examples:
|
||||
| n_slots |
|
||||
| 1 |
|
||||
| 2 |
|
||||
|
||||
Scenario Outline: consistent results with same seed and varying batch size
|
||||
Given 4 slots
|
||||
And <temp> temperature
|
||||
# And 0 as draft
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Given 1 prompts "Write a very long story about AI." with seed 42
|
||||
And concurrent completion requests
|
||||
# Then the server is busy # Not all slots will be utilized.
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
|
||||
Given <n_parallel> prompts "Write a very long story about AI." with seed 42
|
||||
And concurrent completion requests
|
||||
# Then the server is busy # Not all slots will be utilized.
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
|
||||
Then all predictions are equal
|
||||
Examples:
|
||||
| n_parallel | temp |
|
||||
| 1 | 0.0 |
|
||||
| 1 | 1.0 |
|
||||
# FIXME: unified KV cache nondeterminism
|
||||
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
|
||||
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
|
||||
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
|
||||
# | 2 | 0.0 |
|
||||
# | 4 | 0.0 |
|
||||
# | 2 | 1.0 |
|
||||
# | 4 | 1.0 |
|
||||
|
||||
Scenario Outline: consistent token probs with same seed and prompt
|
||||
Given <n_slots> slots
|
||||
And <n_kv> KV cache size
|
||||
And 1.0 temperature
|
||||
And <n_predict> max tokens to predict
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Given 1 prompts "The meaning of life is" with seed 42
|
||||
And concurrent completion requests
|
||||
# Then the server is busy # Not all slots will be utilized.
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
|
||||
Given <n_parallel> prompts "The meaning of life is" with seed 42
|
||||
And concurrent completion requests
|
||||
# Then the server is busy # Not all slots will be utilized.
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
|
||||
Then all token probabilities are equal
|
||||
Examples:
|
||||
| n_slots | n_kv | n_predict | n_parallel |
|
||||
| 4 | 1024 | 1 | 1 |
|
||||
# FIXME: unified KV cache nondeterminism
|
||||
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
|
||||
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
|
||||
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
|
||||
# | 4 | 1024 | 1 | 4 |
|
||||
# | 4 | 1024 | 100 | 1 |
|
||||
# This test still fails even the above patches; the first token probabilities are already different.
|
||||
# | 4 | 1024 | 100 | 4 |
|
||||
@@ -1,68 +0,0 @@
|
||||
@llama.cpp
|
||||
@security
|
||||
Feature: Security
|
||||
|
||||
Background: Server startup with an api key defined
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||
And a server api key THIS_IS_THE_KEY
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario Outline: Completion with some user api key
|
||||
Given a prompt test
|
||||
And a user api key <api_key>
|
||||
And 4 max tokens to predict
|
||||
And a completion request with <api_error> api error
|
||||
|
||||
Examples: Prompts
|
||||
| api_key | api_error |
|
||||
| THIS_IS_THE_KEY | no |
|
||||
| THIS_IS_THE_KEY | no |
|
||||
| hackeme | raised |
|
||||
| | raised |
|
||||
|
||||
Scenario Outline: OAI Compatibility
|
||||
Given a system prompt test
|
||||
And a user prompt test
|
||||
And a model test
|
||||
And 2 max tokens to predict
|
||||
And streaming is disabled
|
||||
And a user api key <api_key>
|
||||
Given an OAI compatible chat completions request with <api_error> api error
|
||||
|
||||
Examples: Prompts
|
||||
| api_key | api_error |
|
||||
| THIS_IS_THE_KEY | no |
|
||||
| THIS_IS_THE_KEY | no |
|
||||
| hackme | raised |
|
||||
|
||||
Scenario Outline: OAI Compatibility (invalid response formats)
|
||||
Given a system prompt test
|
||||
And a user prompt test
|
||||
And a response format <response_format>
|
||||
And a model test
|
||||
And 2 max tokens to predict
|
||||
And streaming is disabled
|
||||
Given an OAI compatible chat completions request with raised api error
|
||||
|
||||
Examples: Prompts
|
||||
| response_format |
|
||||
| {"type": "sound"} |
|
||||
| {"type": "json_object", "schema": 123} |
|
||||
| {"type": "json_object", "schema": {"type": 123}} |
|
||||
| {"type": "json_object", "schema": {"type": "hiccup"}} |
|
||||
|
||||
|
||||
Scenario Outline: CORS Options
|
||||
Given a user api key THIS_IS_THE_KEY
|
||||
When an OPTIONS request is sent from <origin>
|
||||
Then CORS header <cors_header> is set to <cors_header_value>
|
||||
|
||||
Examples: Headers
|
||||
| origin | cors_header | cors_header_value |
|
||||
| localhost | Access-Control-Allow-Origin | localhost |
|
||||
| web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr |
|
||||
| origin | Access-Control-Allow-Credentials | true |
|
||||
| web.mydomain.fr | Access-Control-Allow-Methods | GET, POST |
|
||||
| web.mydomain.fr | Access-Control-Allow-Headers | * |
|
||||
@@ -1,120 +0,0 @@
|
||||
@llama.cpp
|
||||
@server
|
||||
Feature: llama.cpp server
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||
And a model file test-model.gguf
|
||||
And a model alias tinyllama-2
|
||||
And BOS token is 1
|
||||
And 42 as server seed
|
||||
# KV Cache corresponds to the total amount of tokens
|
||||
# that can be stored across all independent sequences: #4130
|
||||
# see --ctx-size and #5568
|
||||
And 256 KV cache size
|
||||
And 32 as batch size
|
||||
And 2 slots
|
||||
And 64 server max tokens to predict
|
||||
And prometheus compatible metrics exposed
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario: Health
|
||||
Then the server is ready
|
||||
And all slots are idle
|
||||
|
||||
|
||||
Scenario Outline: Completion
|
||||
Given a prompt <prompt>
|
||||
And <n_predict> max tokens to predict
|
||||
And a completion request with no api error
|
||||
Then <n_predicted> tokens are predicted matching <re_content>
|
||||
And the completion is <truncated> truncated
|
||||
And <n_prompt> prompt tokens are processed
|
||||
And prometheus metrics are exposed
|
||||
And metric llamacpp:tokens_predicted is <n_predicted>
|
||||
|
||||
Examples: Prompts
|
||||
| prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
|
||||
| I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not |
|
||||
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not |
|
||||
|
||||
Scenario: Completion prompt truncated
|
||||
Given a prompt:
|
||||
"""
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
||||
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
|
||||
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
"""
|
||||
And a completion request with no api error
|
||||
Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
|
||||
And the completion is truncated
|
||||
And 109 prompt tokens are processed
|
||||
|
||||
|
||||
Scenario Outline: OAI Compatibility
|
||||
Given a model <model>
|
||||
And a system prompt <system_prompt>
|
||||
And a user prompt <user_prompt>
|
||||
And <max_tokens> max tokens to predict
|
||||
And streaming is <enable_streaming>
|
||||
Given an OAI compatible chat completions request with no api error
|
||||
Then <n_predicted> tokens are predicted matching <re_content>
|
||||
And <n_prompt> prompt tokens are processed
|
||||
And the completion is <truncated> truncated
|
||||
|
||||
Examples: Prompts
|
||||
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
|
||||
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not |
|
||||
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | |
|
||||
|
||||
|
||||
Scenario Outline: OAI Compatibility w/ response format
|
||||
Given a model test
|
||||
And a system prompt test
|
||||
And a user prompt test
|
||||
And a response format <response_format>
|
||||
And 10 max tokens to predict
|
||||
Given an OAI compatible chat completions request with no api error
|
||||
Then <n_predicted> tokens are predicted matching <re_content>
|
||||
|
||||
Examples: Prompts
|
||||
| response_format | n_predicted | re_content |
|
||||
| {"type": "json_object", "schema": {"const": "42"}} | 6 | "42" |
|
||||
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
|
||||
| {"type": "json_object"} | 10 | \{ " Jacky. |
|
||||
|
||||
|
||||
Scenario: Tokenize / Detokenize
|
||||
When tokenizing:
|
||||
"""
|
||||
What is the capital of France ?
|
||||
"""
|
||||
Then tokens can be detokenized
|
||||
And tokens do not begin with BOS
|
||||
|
||||
Scenario: Tokenize w/ BOS
|
||||
Given adding special tokens
|
||||
When tokenizing:
|
||||
"""
|
||||
What is the capital of Germany?
|
||||
"""
|
||||
Then tokens begin with BOS
|
||||
Given first token is removed
|
||||
Then tokens can be detokenized
|
||||
|
||||
Scenario: Tokenize with pieces
|
||||
When tokenizing with pieces:
|
||||
"""
|
||||
What is the capital of Germany?
|
||||
媽
|
||||
"""
|
||||
Then tokens are given with pieces
|
||||
|
||||
Scenario: Models available
|
||||
Given available models
|
||||
Then 1 models are supported
|
||||
Then model 0 is identified by tinyllama-2
|
||||
Then model 0 is trained on 128 tokens context
|
||||
@@ -1,58 +0,0 @@
|
||||
@llama.cpp
|
||||
@slotsave
|
||||
Feature: llama.cpp server slot management
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||
And prompt caching is enabled
|
||||
And 2 slots
|
||||
And . as slot save path
|
||||
And 2048 KV cache size
|
||||
And 42 as server seed
|
||||
And 24 max tokens to predict
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario: Save and Restore Slot
|
||||
# First prompt in slot 1 should be fully processed
|
||||
Given a user prompt "What is the capital of France?"
|
||||
And using slot id 1
|
||||
And a completion request with no api error
|
||||
Then 24 tokens are predicted matching (Lily|cake)
|
||||
And 22 prompt tokens are processed
|
||||
When the slot 1 is saved with filename "slot1.bin"
|
||||
Then the server responds with status code 200
|
||||
# Since we have cache, this should only process the last tokens
|
||||
Given a user prompt "What is the capital of Germany?"
|
||||
And a completion request with no api error
|
||||
Then 24 tokens are predicted matching (Thank|special)
|
||||
And 7 prompt tokens are processed
|
||||
# Loading the original cache into slot 0,
|
||||
# we should only be processing 1 prompt token and get the same output
|
||||
When the slot 0 is restored with filename "slot1.bin"
|
||||
Then the server responds with status code 200
|
||||
Given a user prompt "What is the capital of France?"
|
||||
And using slot id 0
|
||||
And a completion request with no api error
|
||||
Then 24 tokens are predicted matching (Lily|cake)
|
||||
And 1 prompt tokens are processed
|
||||
# For verification that slot 1 was not corrupted during slot 0 load, same thing
|
||||
Given a user prompt "What is the capital of Germany?"
|
||||
And using slot id 1
|
||||
And a completion request with no api error
|
||||
Then 24 tokens are predicted matching (Thank|special)
|
||||
And 1 prompt tokens are processed
|
||||
|
||||
Scenario: Erase Slot
|
||||
Given a user prompt "What is the capital of France?"
|
||||
And using slot id 1
|
||||
And a completion request with no api error
|
||||
Then 24 tokens are predicted matching (Lily|cake)
|
||||
And 22 prompt tokens are processed
|
||||
When the slot 1 is erased
|
||||
Then the server responds with status code 200
|
||||
Given a user prompt "What is the capital of France?"
|
||||
And a completion request with no api error
|
||||
Then 24 tokens are predicted matching (Lily|cake)
|
||||
And 22 prompt tokens are processed
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,25 +0,0 @@
|
||||
# run with: ./tests.sh --no-skipped --tags wrong_usage
|
||||
@wrong_usage
|
||||
Feature: Wrong usage of llama.cpp server
|
||||
|
||||
#3969 The user must always set --n-predict option
|
||||
# to cap the number of tokens any completion request can generate
|
||||
# or pass n_predict/max_tokens in the request.
|
||||
Scenario: Infinite loop
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||
And 42 as server seed
|
||||
And 2048 KV cache size
|
||||
# Uncomment below to fix the issue
|
||||
#And 64 server max tokens to predict
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
Given a prompt:
|
||||
"""
|
||||
Go to: infinite loop
|
||||
"""
|
||||
# Uncomment below to fix the issue
|
||||
#And 128 max tokens to predict
|
||||
Given concurrent completion requests
|
||||
Then the server is idle
|
||||
Then all prompts are predicted
|
||||
@@ -1,7 +1,7 @@
|
||||
aiohttp~=3.9.3
|
||||
behave~=1.2.6
|
||||
pytest~=8.3.3
|
||||
huggingface_hub~=0.23.2
|
||||
numpy~=1.26.4
|
||||
openai~=1.30.3
|
||||
openai~=1.55.3
|
||||
prometheus-client~=0.20.0
|
||||
requests~=2.32.3
|
||||
|
||||
@@ -4,8 +4,7 @@ set -eu
|
||||
|
||||
if [ $# -lt 1 ]
|
||||
then
|
||||
# Start @llama.cpp scenario
|
||||
behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
|
||||
pytest -v -x
|
||||
else
|
||||
behave "$@"
|
||||
pytest "$@"
|
||||
fi
|
||||
|
||||
48
examples/server/tests/unit/test_basic.py
Normal file
48
examples/server/tests/unit/test_basic.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
def test_server_start_simple():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("GET", "/health")
|
||||
assert res.status_code == 200
|
||||
|
||||
|
||||
def test_server_props():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("GET", "/props")
|
||||
assert res.status_code == 200
|
||||
assert res.body["total_slots"] == server.n_slots
|
||||
|
||||
|
||||
def test_server_models():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("GET", "/models")
|
||||
assert res.status_code == 200
|
||||
assert len(res.body["data"]) == 1
|
||||
assert res.body["data"][0]["id"] == server.model_alias
|
||||
|
||||
def test_load_split_model():
|
||||
global server
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "tinyllamas/split/stories15M-q8_0-00001-of-00003.gguf"
|
||||
server.model_alias = "tinyllama-split"
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"n_predict": 16,
|
||||
"prompt": "Hello",
|
||||
"temperature": 0.0,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(little|girl)+", res.body["content"])
|
||||
148
examples/server/tests/unit/test_chat_completion.py
Normal file
148
examples/server/tests/unit/test_chat_completion.py
Normal file
@@ -0,0 +1,148 @@
|
||||
import pytest
|
||||
from openai import OpenAI
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
|
||||
[
|
||||
("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
|
||||
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
|
||||
]
|
||||
)
|
||||
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"model": model,
|
||||
"max_tokens": max_tokens,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_prompt},
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["usage"]["prompt_tokens"] == n_prompt
|
||||
assert res.body["usage"]["completion_tokens"] == n_predicted
|
||||
choice = res.body["choices"][0]
|
||||
assert "assistant" == choice["message"]["role"]
|
||||
assert match_regex(re_content, choice["message"]["content"])
|
||||
if truncated:
|
||||
assert choice["finish_reason"] == "length"
|
||||
else:
|
||||
assert choice["finish_reason"] == "stop"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
|
||||
[
|
||||
("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
|
||||
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
|
||||
]
|
||||
)
|
||||
def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_stream_request("POST", "/chat/completions", data={
|
||||
"model": model,
|
||||
"max_tokens": max_tokens,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_prompt},
|
||||
],
|
||||
"stream": True,
|
||||
})
|
||||
content = ""
|
||||
for data in res:
|
||||
choice = data["choices"][0]
|
||||
if choice["finish_reason"] in ["stop", "length"]:
|
||||
assert data["usage"]["prompt_tokens"] == n_prompt
|
||||
assert data["usage"]["completion_tokens"] == n_predicted
|
||||
assert "content" not in choice["delta"]
|
||||
assert match_regex(re_content, content)
|
||||
# FIXME: not sure why this is incorrect in stream mode
|
||||
# if truncated:
|
||||
# assert choice["finish_reason"] == "length"
|
||||
# else:
|
||||
# assert choice["finish_reason"] == "stop"
|
||||
else:
|
||||
assert choice["finish_reason"] is None
|
||||
content += choice["delta"]["content"]
|
||||
|
||||
|
||||
def test_chat_completion_with_openai_library():
|
||||
global server
|
||||
server.start()
|
||||
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
|
||||
res = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo-instruct",
|
||||
messages=[
|
||||
{"role": "system", "content": "Book"},
|
||||
{"role": "user", "content": "What is the best book"},
|
||||
],
|
||||
max_tokens=8,
|
||||
seed=42,
|
||||
temperature=0.8,
|
||||
)
|
||||
print(res)
|
||||
assert res.choices[0].finish_reason == "stop"
|
||||
assert res.choices[0].message.content is not None
|
||||
assert match_regex("(Suddenly)+", res.choices[0].message.content)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("response_format,n_predicted,re_content", [
|
||||
({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
|
||||
({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
|
||||
({"type": "json_object"}, 10, "(\\{|John)+"),
|
||||
({"type": "sound"}, 0, None),
|
||||
# invalid response format (expected to fail)
|
||||
({"type": "json_object", "schema": 123}, 0, None),
|
||||
({"type": "json_object", "schema": {"type": 123}}, 0, None),
|
||||
({"type": "json_object", "schema": {"type": "hiccup"}}, 0, None),
|
||||
])
|
||||
def test_completion_with_response_format(response_format: dict, n_predicted: int, re_content: str | None):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": n_predicted,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a coding assistant."},
|
||||
{"role": "user", "content": "Write an example"},
|
||||
],
|
||||
"response_format": response_format,
|
||||
})
|
||||
if re_content is not None:
|
||||
assert res.status_code == 200
|
||||
choice = res.body["choices"][0]
|
||||
assert match_regex(re_content, choice["message"]["content"])
|
||||
else:
|
||||
assert res.status_code != 200
|
||||
assert "error" in res.body
|
||||
|
||||
|
||||
@pytest.mark.parametrize("messages", [
|
||||
None,
|
||||
"string",
|
||||
[123],
|
||||
[{}],
|
||||
[{"role": 123}],
|
||||
[{"role": "system", "content": 123}],
|
||||
# [{"content": "hello"}], # TODO: should not be a valid case
|
||||
[{"role": "system", "content": "test"}, {}],
|
||||
])
|
||||
def test_invalid_chat_completion_req(messages):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"messages": messages,
|
||||
})
|
||||
assert res.status_code == 400 or res.status_code == 500
|
||||
assert "error" in res.body
|
||||
223
examples/server/tests/unit/test_completion.py
Normal file
223
examples/server/tests/unit/test_completion.py
Normal file
@@ -0,0 +1,223 @@
|
||||
import pytest
|
||||
import time
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
|
||||
("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
|
||||
("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
|
||||
])
|
||||
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"n_predict": n_predict,
|
||||
"prompt": prompt,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["timings"]["prompt_n"] == n_prompt
|
||||
assert res.body["timings"]["predicted_n"] == n_predicted
|
||||
assert res.body["truncated"] == truncated
|
||||
assert match_regex(re_content, res.body["content"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
|
||||
("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
|
||||
("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
|
||||
])
|
||||
def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_stream_request("POST", "/completion", data={
|
||||
"n_predict": n_predict,
|
||||
"prompt": prompt,
|
||||
"stream": True,
|
||||
})
|
||||
content = ""
|
||||
for data in res:
|
||||
if data["stop"]:
|
||||
assert data["timings"]["prompt_n"] == n_prompt
|
||||
assert data["timings"]["predicted_n"] == n_predicted
|
||||
assert data["truncated"] == truncated
|
||||
assert match_regex(re_content, content)
|
||||
else:
|
||||
content += data["content"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_slots", [1, 2])
|
||||
def test_consistent_result_same_seed(n_slots: int):
|
||||
global server
|
||||
server.n_slots = n_slots
|
||||
server.start()
|
||||
last_res = None
|
||||
for _ in range(4):
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
"seed": 42,
|
||||
"temperature": 1.0,
|
||||
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
|
||||
})
|
||||
if last_res is not None:
|
||||
assert res.body["content"] == last_res.body["content"]
|
||||
last_res = res
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_slots", [1, 2])
|
||||
def test_different_result_different_seed(n_slots: int):
|
||||
global server
|
||||
server.n_slots = n_slots
|
||||
server.start()
|
||||
last_res = None
|
||||
for seed in range(4):
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
"seed": seed,
|
||||
"temperature": 1.0,
|
||||
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
|
||||
})
|
||||
if last_res is not None:
|
||||
assert res.body["content"] != last_res.body["content"]
|
||||
last_res = res
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_batch", [16, 32])
|
||||
@pytest.mark.parametrize("temperature", [0.0, 1.0])
|
||||
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
|
||||
global server
|
||||
server.n_batch = n_batch
|
||||
server.start()
|
||||
last_res = None
|
||||
for _ in range(4):
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
"seed": 42,
|
||||
"temperature": temperature,
|
||||
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
|
||||
})
|
||||
if last_res is not None:
|
||||
assert res.body["content"] == last_res.body["content"]
|
||||
last_res = res
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test fails on linux, need to be fixed")
|
||||
def test_cache_vs_nocache_prompt():
|
||||
global server
|
||||
server.start()
|
||||
res_cache = server.make_request("POST", "/completion", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
"seed": 42,
|
||||
"temperature": 1.0,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
res_no_cache = server.make_request("POST", "/completion", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
"seed": 42,
|
||||
"temperature": 1.0,
|
||||
"cache_prompt": False,
|
||||
})
|
||||
assert res_cache.body["content"] == res_no_cache.body["content"]
|
||||
|
||||
|
||||
def test_completion_with_tokens_input():
|
||||
global server
|
||||
server.temperature = 0.0
|
||||
server.start()
|
||||
prompt_str = "I believe the meaning of life is"
|
||||
res = server.make_request("POST", "/tokenize", data={
|
||||
"content": prompt_str,
|
||||
"add_special": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
tokens = res.body["tokens"]
|
||||
|
||||
# single completion
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": tokens,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert type(res.body["content"]) == str
|
||||
|
||||
# batch completion
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": [tokens, tokens],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert type(res.body) == list
|
||||
assert len(res.body) == 2
|
||||
assert res.body[0]["content"] == res.body[1]["content"]
|
||||
|
||||
# mixed string and tokens
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": [tokens, prompt_str],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert type(res.body) == list
|
||||
assert len(res.body) == 2
|
||||
assert res.body[0]["content"] == res.body[1]["content"]
|
||||
|
||||
# mixed string and tokens in one sequence
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert type(res.body["content"]) == str
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_slots,n_requests", [
|
||||
(1, 3),
|
||||
(2, 2),
|
||||
(2, 4),
|
||||
(4, 2), # some slots must be idle
|
||||
(4, 6),
|
||||
])
|
||||
def test_completion_parallel_slots(n_slots: int, n_requests: int):
|
||||
global server
|
||||
server.n_slots = n_slots
|
||||
server.temperature = 0.0
|
||||
server.start()
|
||||
|
||||
PROMPTS = [
|
||||
("Write a very long book.", "(very|special|big)+"),
|
||||
("Write another a poem.", "(small|house)+"),
|
||||
("What is LLM?", "(Dad|said)+"),
|
||||
("The sky is blue and I love it.", "(climb|leaf)+"),
|
||||
("Write another very long music lyrics.", "(friends|step|sky)+"),
|
||||
("Write a very long joke.", "(cat|Whiskers)+"),
|
||||
]
|
||||
def check_slots_status():
|
||||
should_all_slots_busy = n_requests >= n_slots
|
||||
time.sleep(0.1)
|
||||
res = server.make_request("GET", "/slots")
|
||||
n_busy = sum([1 for slot in res.body if slot["is_processing"]])
|
||||
if should_all_slots_busy:
|
||||
assert n_busy == n_slots
|
||||
else:
|
||||
assert n_busy <= n_slots
|
||||
|
||||
tasks = []
|
||||
for i in range(n_requests):
|
||||
prompt, re_content = PROMPTS[i % len(PROMPTS)]
|
||||
tasks.append((server.make_request, ("POST", "/completion", {
|
||||
"prompt": prompt,
|
||||
"seed": 42,
|
||||
"temperature": 1.0,
|
||||
})))
|
||||
tasks.append((check_slots_status, ()))
|
||||
results = parallel_function_calls(tasks)
|
||||
|
||||
# check results
|
||||
for i in range(n_requests):
|
||||
prompt, re_content = PROMPTS[i % len(PROMPTS)]
|
||||
res = results[i]
|
||||
assert res.status_code == 200
|
||||
assert type(res.body["content"]) == str
|
||||
assert len(res.body["content"]) > 10
|
||||
# FIXME: the result is not deterministic when using other slot than slot 0
|
||||
# assert match_regex(re_content, res.body["content"])
|
||||
67
examples/server/tests/unit/test_ctx_shift.py
Normal file
67
examples/server/tests/unit/test_ctx_shift.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
LONG_TEXT = """
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
||||
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
|
||||
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
""".strip()
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
server.n_ctx = 256
|
||||
server.n_slots = 2
|
||||
|
||||
|
||||
def test_ctx_shift_enabled():
|
||||
# the prompt is 301 tokens
|
||||
# the slot context is 256/2 = 128 tokens
|
||||
# the prompt is truncated to keep the last 109 tokens
|
||||
# 64 tokens are generated thanks to shifting the context when it gets full
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"n_predict": 64,
|
||||
"prompt": LONG_TEXT,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["timings"]["prompt_n"] == 109
|
||||
assert res.body["timings"]["predicted_n"] == 64
|
||||
assert res.body["truncated"] is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_predict,n_token_output,truncated", [
|
||||
(64, 64, False),
|
||||
(-1, 120, True),
|
||||
])
|
||||
def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
|
||||
global server
|
||||
server.disable_ctx_shift = True
|
||||
server.n_predict = -1
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"n_predict": n_predict,
|
||||
"prompt": "Hi how are you",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["timings"]["predicted_n"] == n_token_output
|
||||
assert res.body["truncated"] == truncated
|
||||
|
||||
|
||||
def test_ctx_shift_disabled_long_prompt():
|
||||
global server
|
||||
server.disable_ctx_shift = True
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"n_predict": 64,
|
||||
"prompt": LONG_TEXT,
|
||||
})
|
||||
assert res.status_code != 200
|
||||
assert "error" in res.body
|
||||
assert "exceeds the available context size" in res.body["error"]["message"]
|
||||
99
examples/server/tests/unit/test_embedding.py
Normal file
99
examples/server/tests/unit/test_embedding.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import pytest
|
||||
from openai import OpenAI
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.bert_bge_small()
|
||||
|
||||
EPSILON = 1e-3
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.bert_bge_small()
|
||||
|
||||
|
||||
def test_embedding_single():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"input": "I believe the meaning of life is",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body['data']) == 1
|
||||
assert 'embedding' in res.body['data'][0]
|
||||
assert len(res.body['data'][0]['embedding']) > 1
|
||||
|
||||
# make sure embedding vector is normalized
|
||||
assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON
|
||||
|
||||
|
||||
def test_embedding_multiple():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"input": [
|
||||
"I believe the meaning of life is",
|
||||
"Write a joke about AI from a very long prompt which will not be truncated",
|
||||
"This is a test",
|
||||
"This is another test",
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body['data']) == 4
|
||||
for d in res.body['data']:
|
||||
assert 'embedding' in d
|
||||
assert len(d['embedding']) > 1
|
||||
|
||||
|
||||
def test_embedding_openai_library_single():
|
||||
global server
|
||||
server.start()
|
||||
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
|
||||
res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is")
|
||||
assert len(res.data) == 1
|
||||
assert len(res.data[0].embedding) > 1
|
||||
|
||||
|
||||
def test_embedding_openai_library_multiple():
|
||||
global server
|
||||
server.start()
|
||||
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
|
||||
res = client.embeddings.create(model="text-embedding-3-small", input=[
|
||||
"I believe the meaning of life is",
|
||||
"Write a joke about AI from a very long prompt which will not be truncated",
|
||||
"This is a test",
|
||||
"This is another test",
|
||||
])
|
||||
assert len(res.data) == 4
|
||||
for d in res.data:
|
||||
assert len(d.embedding) > 1
|
||||
|
||||
|
||||
def test_embedding_error_prompt_too_long():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"input": "This is a test " * 512,
|
||||
})
|
||||
assert res.status_code != 200
|
||||
assert "too large" in res.body["error"]["message"]
|
||||
|
||||
|
||||
def test_same_prompt_give_same_result():
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"input": [
|
||||
"I believe the meaning of life is",
|
||||
"I believe the meaning of life is",
|
||||
"I believe the meaning of life is",
|
||||
"I believe the meaning of life is",
|
||||
"I believe the meaning of life is",
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body['data']) == 5
|
||||
for i in range(1, len(res.body['data'])):
|
||||
v0 = res.body['data'][0]['embedding']
|
||||
vi = res.body['data'][i]['embedding']
|
||||
for x, y in zip(v0, vi):
|
||||
assert abs(x - y) < EPSILON
|
||||
57
examples/server/tests/unit/test_infill.py
Normal file
57
examples/server/tests/unit/test_infill.py
Normal file
@@ -0,0 +1,57 @@
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama_infill()
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama_infill()
|
||||
|
||||
|
||||
def test_infill_without_input_extra():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"prompt": "Complete this",
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
|
||||
|
||||
|
||||
def test_infill_with_input_extra():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"prompt": "Complete this",
|
||||
"input_extra": [{
|
||||
"filename": "llama.h",
|
||||
"text": "LLAMA_API int32_t llama_n_threads();\n"
|
||||
}],
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_extra", [
|
||||
{},
|
||||
{"filename": "ok"},
|
||||
{"filename": 123},
|
||||
{"filename": 123, "text": "abc"},
|
||||
{"filename": 123, "text": 456},
|
||||
])
|
||||
def test_invalid_input_extra_req(input_extra):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"prompt": "Complete this",
|
||||
"input_extra": [input_extra],
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 400
|
||||
assert "error" in res.body
|
||||
42
examples/server/tests/unit/test_lora.py
Normal file
42
examples/server/tests/unit/test_lora.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import pytest
|
||||
import os
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.stories15m_moe()
|
||||
|
||||
LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.stories15m_moe()
|
||||
# download lora file if needed
|
||||
file_name = LORA_FILE_URL.split('/').pop()
|
||||
lora_file = f'../../../{file_name}'
|
||||
if not os.path.exists(lora_file):
|
||||
print(f"Downloading {LORA_FILE_URL} to {lora_file}")
|
||||
with open(lora_file, 'wb') as f:
|
||||
f.write(requests.get(LORA_FILE_URL).content)
|
||||
print(f"Done downloading lora file")
|
||||
server.lora_files = [lora_file]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("scale,re_content", [
|
||||
# without applying lora, the model should behave like a bedtime story generator
|
||||
(0.0, "(little|girl|three|years|old)+"),
|
||||
# with lora, the model should behave like a Shakespearean text generator
|
||||
(1.0, "(eye|love|glass|sun)+"),
|
||||
])
|
||||
def test_lora(scale: float, re_content: str):
|
||||
global server
|
||||
server.start()
|
||||
res_lora_control = server.make_request("POST", "/lora-adapters", data=[
|
||||
{"id": 0, "scale": scale}
|
||||
])
|
||||
assert res_lora_control.status_code == 200
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "Look in thy glass",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex(re_content, res.body["content"])
|
||||
|
||||
55
examples/server/tests/unit/test_rerank.py
Normal file
55
examples/server/tests/unit/test_rerank.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.jina_reranker_tiny()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.jina_reranker_tiny()
|
||||
|
||||
|
||||
def test_rerank():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/rerank", data={
|
||||
"query": "Machine learning is",
|
||||
"documents": [
|
||||
"A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.",
|
||||
"Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.",
|
||||
"Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
|
||||
"Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine."
|
||||
]
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body["results"]) == 4
|
||||
|
||||
most_relevant = res.body["results"][0]
|
||||
least_relevant = res.body["results"][0]
|
||||
for doc in res.body["results"]:
|
||||
if doc["relevance_score"] > most_relevant["relevance_score"]:
|
||||
most_relevant = doc
|
||||
if doc["relevance_score"] < least_relevant["relevance_score"]:
|
||||
least_relevant = doc
|
||||
|
||||
assert most_relevant["relevance_score"] > least_relevant["relevance_score"]
|
||||
assert most_relevant["index"] == 2
|
||||
assert least_relevant["index"] == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize("documents", [
|
||||
[],
|
||||
None,
|
||||
123,
|
||||
[1, 2, 3],
|
||||
])
|
||||
def test_invalid_rerank_req(documents):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/rerank", data={
|
||||
"query": "Machine learning is",
|
||||
"documents": documents,
|
||||
})
|
||||
assert res.status_code == 400
|
||||
assert "error" in res.body
|
||||
83
examples/server/tests/unit/test_security.py
Normal file
83
examples/server/tests/unit/test_security.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import pytest
|
||||
from openai import OpenAI
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
TEST_API_KEY = "sk-this-is-the-secret-key"
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
server.api_key = TEST_API_KEY
|
||||
|
||||
|
||||
@pytest.mark.parametrize("endpoint", ["/health", "/models"])
|
||||
def test_access_public_endpoint(endpoint: str):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("GET", endpoint)
|
||||
assert res.status_code == 200
|
||||
assert "error" not in res.body
|
||||
|
||||
|
||||
@pytest.mark.parametrize("api_key", [None, "invalid-key"])
|
||||
def test_incorrect_api_key(api_key: str):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completions", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
}, headers={
|
||||
"Authorization": f"Bearer {api_key}" if api_key else None,
|
||||
})
|
||||
assert res.status_code == 401
|
||||
assert "error" in res.body
|
||||
assert res.body["error"]["type"] == "authentication_error"
|
||||
|
||||
|
||||
def test_correct_api_key():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completions", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
}, headers={
|
||||
"Authorization": f"Bearer {TEST_API_KEY}",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "error" not in res.body
|
||||
assert "content" in res.body
|
||||
|
||||
|
||||
def test_openai_library_correct_api_key():
|
||||
global server
|
||||
server.start()
|
||||
client = OpenAI(api_key=TEST_API_KEY, base_url=f"http://{server.server_host}:{server.server_port}")
|
||||
res = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a chatbot."},
|
||||
{"role": "user", "content": "What is the meaning of life?"},
|
||||
],
|
||||
)
|
||||
assert len(res.choices) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("origin,cors_header,cors_header_value", [
|
||||
("localhost", "Access-Control-Allow-Origin", "localhost"),
|
||||
("web.mydomain.fr", "Access-Control-Allow-Origin", "web.mydomain.fr"),
|
||||
("origin", "Access-Control-Allow-Credentials", "true"),
|
||||
("web.mydomain.fr", "Access-Control-Allow-Methods", "GET, POST"),
|
||||
("web.mydomain.fr", "Access-Control-Allow-Headers", "*"),
|
||||
])
|
||||
def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("OPTIONS", "/completions", headers={
|
||||
"Origin": origin,
|
||||
"Access-Control-Request-Method": "POST",
|
||||
"Access-Control-Request-Headers": "Authorization",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert cors_header in res.headers
|
||||
assert res.headers[cors_header] == cors_header_value
|
||||
98
examples/server/tests/unit/test_slot_save.py
Normal file
98
examples/server/tests/unit/test_slot_save.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
server.slot_save_path = "./tmp"
|
||||
server.temperature = 0.0
|
||||
|
||||
|
||||
def test_slot_save_restore():
|
||||
global server
|
||||
server.start()
|
||||
|
||||
# First prompt in slot 1 should be fully processed
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "What is the capital of France?",
|
||||
"id_slot": 1,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(Whiskers|Flana)+", res.body["content"])
|
||||
assert res.body["timings"]["prompt_n"] == 21 # all tokens are processed
|
||||
|
||||
# Save state of slot 1
|
||||
res = server.make_request("POST", "/slots/1?action=save", data={
|
||||
"filename": "slot1.bin",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["n_saved"] == 84
|
||||
|
||||
# Since we have cache, this should only process the last tokens
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "What is the capital of Germany?",
|
||||
"id_slot": 1,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(Jack|said)+", res.body["content"])
|
||||
assert res.body["timings"]["prompt_n"] == 6 # only different part is processed
|
||||
|
||||
# Loading the saved cache into slot 0
|
||||
res = server.make_request("POST", "/slots/0?action=restore", data={
|
||||
"filename": "slot1.bin",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["n_restored"] == 84
|
||||
|
||||
# Since we have cache, slot 0 should only process the last tokens
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "What is the capital of Germany?",
|
||||
"id_slot": 0,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(Jack|said)+", res.body["content"])
|
||||
assert res.body["timings"]["prompt_n"] == 6 # only different part is processed
|
||||
|
||||
# For verification that slot 1 was not corrupted during slot 0 load, same thing should work
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "What is the capital of Germany?",
|
||||
"id_slot": 1,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(Jack|said)+", res.body["content"])
|
||||
assert res.body["timings"]["prompt_n"] == 1
|
||||
|
||||
|
||||
def test_slot_erase():
|
||||
global server
|
||||
server.start()
|
||||
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "What is the capital of France?",
|
||||
"id_slot": 1,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(Whiskers|Flana)+", res.body["content"])
|
||||
assert res.body["timings"]["prompt_n"] == 21 # all tokens are processed
|
||||
|
||||
# erase slot 1
|
||||
res = server.make_request("POST", "/slots/1?action=erase")
|
||||
assert res.status_code == 200
|
||||
|
||||
# re-run the same prompt, it should process all tokens again
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "What is the capital of France?",
|
||||
"id_slot": 1,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(Whiskers|Flana)+", res.body["content"])
|
||||
assert res.body["timings"]["prompt_n"] == 21 # all tokens are processed
|
||||
103
examples/server/tests/unit/test_speculative.py
Normal file
103
examples/server/tests/unit/test_speculative.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
# We use a F16 MOE gguf as main model, and q4_0 as draft model
|
||||
|
||||
server = ServerPreset.stories15m_moe()
|
||||
|
||||
MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf"
|
||||
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.stories15m_moe()
|
||||
# download draft model file if needed
|
||||
file_name = MODEL_DRAFT_FILE_URL.split('/').pop()
|
||||
model_draft_file = f'../../../{file_name}'
|
||||
if not os.path.exists(model_draft_file):
|
||||
print(f"Downloading {MODEL_DRAFT_FILE_URL} to {model_draft_file}")
|
||||
with open(model_draft_file, 'wb') as f:
|
||||
f.write(requests.get(MODEL_DRAFT_FILE_URL).content)
|
||||
print(f"Done downloading draft model file")
|
||||
# set default values
|
||||
server.model_draft = model_draft_file
|
||||
server.draft_min = 4
|
||||
server.draft_max = 8
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def fixture_create_server():
|
||||
return create_server()
|
||||
|
||||
|
||||
def test_with_and_without_draft():
|
||||
global server
|
||||
server.model_draft = None # disable draft model
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
content_no_draft = res.body["content"]
|
||||
server.stop()
|
||||
|
||||
# create new server with draft model
|
||||
create_server()
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
content_draft = res.body["content"]
|
||||
|
||||
assert content_no_draft == content_draft
|
||||
|
||||
|
||||
def test_different_draft_min_draft_max():
|
||||
global server
|
||||
test_values = [
|
||||
(1, 2),
|
||||
(1, 4),
|
||||
(4, 8),
|
||||
(4, 12),
|
||||
(8, 16),
|
||||
]
|
||||
last_content = None
|
||||
for draft_min, draft_max in test_values:
|
||||
server.stop()
|
||||
server.draft_min = draft_min
|
||||
server.draft_max = draft_max
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
if last_content is not None:
|
||||
assert last_content == res.body["content"]
|
||||
last_content = res.body["content"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_slots,n_requests", [
|
||||
(1, 2),
|
||||
(2, 2),
|
||||
])
|
||||
def test_multi_requests_parallel(n_slots: int, n_requests: int):
|
||||
global server
|
||||
server.n_slots = n_slots
|
||||
server.start()
|
||||
tasks = []
|
||||
for _ in range(n_requests):
|
||||
tasks.append((server.make_request, ("POST", "/completion", {
|
||||
"prompt": "I believe the meaning of life is",
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
})))
|
||||
results = parallel_function_calls(tasks)
|
||||
for res in results:
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(wise|kind|owl|answer)+", res.body["content"])
|
||||
59
examples/server/tests/unit/test_tokenize.py
Normal file
59
examples/server/tests/unit/test_tokenize.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
def test_tokenize_detokenize():
|
||||
global server
|
||||
server.start()
|
||||
# tokenize
|
||||
content = "What is the capital of France ?"
|
||||
res_tok = server.make_request("POST", "/tokenize", data={
|
||||
"content": content
|
||||
})
|
||||
assert res_tok.status_code == 200
|
||||
assert len(res_tok.body["tokens"]) > 5
|
||||
# detokenize
|
||||
res_detok = server.make_request("POST", "/detokenize", data={
|
||||
"tokens": res_tok.body["tokens"],
|
||||
})
|
||||
assert res_detok.status_code == 200
|
||||
assert res_detok.body["content"].strip() == content
|
||||
|
||||
|
||||
def test_tokenize_with_bos():
|
||||
global server
|
||||
server.start()
|
||||
# tokenize
|
||||
content = "What is the capital of France ?"
|
||||
bosId = 1
|
||||
res_tok = server.make_request("POST", "/tokenize", data={
|
||||
"content": content,
|
||||
"add_special": True,
|
||||
})
|
||||
assert res_tok.status_code == 200
|
||||
assert res_tok.body["tokens"][0] == bosId
|
||||
|
||||
|
||||
def test_tokenize_with_pieces():
|
||||
global server
|
||||
server.start()
|
||||
# tokenize
|
||||
content = "This is a test string with unicode 媽 and emoji 🤗"
|
||||
res_tok = server.make_request("POST", "/tokenize", data={
|
||||
"content": content,
|
||||
"with_pieces": True,
|
||||
})
|
||||
assert res_tok.status_code == 200
|
||||
for token in res_tok.body["tokens"]:
|
||||
assert "id" in token
|
||||
assert token["id"] > 0
|
||||
assert "piece" in token
|
||||
assert len(token["piece"]) > 0
|
||||
371
examples/server/tests/utils.py
Normal file
371
examples/server/tests/utils.py
Normal file
@@ -0,0 +1,371 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# type: ignore[reportUnusedImport]
|
||||
|
||||
import subprocess
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import sys
|
||||
import requests
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
ContextManager,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Literal,
|
||||
Tuple,
|
||||
Set,
|
||||
)
|
||||
from re import RegexFlag
|
||||
|
||||
|
||||
class ServerResponse:
|
||||
headers: dict
|
||||
status_code: int
|
||||
body: dict | Any
|
||||
|
||||
|
||||
class ServerProcess:
|
||||
# default options
|
||||
debug: bool = False
|
||||
server_port: int = 8080
|
||||
server_host: str = "127.0.0.1"
|
||||
model_hf_repo: str = "ggml-org/models"
|
||||
model_hf_file: str = "tinyllamas/stories260K.gguf"
|
||||
model_alias: str = "tinyllama-2"
|
||||
temperature: float = 0.8
|
||||
seed: int = 42
|
||||
|
||||
# custom options
|
||||
model_alias: str | None = None
|
||||
model_url: str | None = None
|
||||
model_file: str | None = None
|
||||
model_draft: str | None = None
|
||||
n_threads: int | None = None
|
||||
n_gpu_layer: int | None = None
|
||||
n_batch: int | None = None
|
||||
n_ubatch: int | None = None
|
||||
n_ctx: int | None = None
|
||||
n_ga: int | None = None
|
||||
n_ga_w: int | None = None
|
||||
n_predict: int | None = None
|
||||
n_prompts: int | None = 0
|
||||
slot_save_path: str | None = None
|
||||
id_slot: int | None = None
|
||||
cache_prompt: bool | None = None
|
||||
n_slots: int | None = None
|
||||
server_continuous_batching: bool | None = False
|
||||
server_embeddings: bool | None = False
|
||||
server_reranking: bool | None = False
|
||||
server_metrics: bool | None = False
|
||||
draft: int | None = None
|
||||
api_key: str | None = None
|
||||
response_format: str | None = None
|
||||
lora_files: List[str] | None = None
|
||||
disable_ctx_shift: int | None = False
|
||||
draft_min: int | None = None
|
||||
draft_max: int | None = None
|
||||
|
||||
# session variables
|
||||
process: subprocess.Popen | None = None
|
||||
|
||||
def __init__(self):
|
||||
if "N_GPU_LAYERS" in os.environ:
|
||||
self.n_gpu_layer = int(os.environ["N_GPU_LAYERS"])
|
||||
if "DEBUG" in os.environ:
|
||||
self.debug = True
|
||||
if "PORT" in os.environ:
|
||||
self.server_port = int(os.environ["PORT"])
|
||||
|
||||
def start(self, timeout_seconds: int = 10) -> None:
|
||||
if "LLAMA_SERVER_BIN_PATH" in os.environ:
|
||||
server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
|
||||
elif os.name == "nt":
|
||||
server_path = "../../../build/bin/Release/llama-server.exe"
|
||||
else:
|
||||
server_path = "../../../build/bin/llama-server"
|
||||
server_args = [
|
||||
"--slots", # requires to get slot status via /slots endpoint
|
||||
"--host",
|
||||
self.server_host,
|
||||
"--port",
|
||||
self.server_port,
|
||||
"--temp",
|
||||
self.temperature,
|
||||
"--seed",
|
||||
self.seed,
|
||||
]
|
||||
if self.model_file:
|
||||
server_args.extend(["--model", self.model_file])
|
||||
if self.model_url:
|
||||
server_args.extend(["--model-url", self.model_url])
|
||||
if self.model_draft:
|
||||
server_args.extend(["--model-draft", self.model_draft])
|
||||
if self.model_hf_repo:
|
||||
server_args.extend(["--hf-repo", self.model_hf_repo])
|
||||
if self.model_hf_file:
|
||||
server_args.extend(["--hf-file", self.model_hf_file])
|
||||
if self.n_batch:
|
||||
server_args.extend(["--batch-size", self.n_batch])
|
||||
if self.n_ubatch:
|
||||
server_args.extend(["--ubatch-size", self.n_ubatch])
|
||||
if self.n_threads:
|
||||
server_args.extend(["--threads", self.n_threads])
|
||||
if self.n_gpu_layer:
|
||||
server_args.extend(["--n-gpu-layers", self.n_gpu_layer])
|
||||
if self.draft is not None:
|
||||
server_args.extend(["--draft", self.draft])
|
||||
if self.server_continuous_batching:
|
||||
server_args.append("--cont-batching")
|
||||
if self.server_embeddings:
|
||||
server_args.append("--embedding")
|
||||
if self.server_reranking:
|
||||
server_args.append("--reranking")
|
||||
if self.server_metrics:
|
||||
server_args.append("--metrics")
|
||||
if self.model_alias:
|
||||
server_args.extend(["--alias", self.model_alias])
|
||||
if self.n_ctx:
|
||||
server_args.extend(["--ctx-size", self.n_ctx])
|
||||
if self.n_slots:
|
||||
server_args.extend(["--parallel", self.n_slots])
|
||||
if self.n_predict:
|
||||
server_args.extend(["--n-predict", self.n_predict])
|
||||
if self.slot_save_path:
|
||||
server_args.extend(["--slot-save-path", self.slot_save_path])
|
||||
if self.n_ga:
|
||||
server_args.extend(["--grp-attn-n", self.n_ga])
|
||||
if self.n_ga_w:
|
||||
server_args.extend(["--grp-attn-w", self.n_ga_w])
|
||||
if self.debug:
|
||||
server_args.append("--verbose")
|
||||
if self.lora_files:
|
||||
for lora_file in self.lora_files:
|
||||
server_args.extend(["--lora", lora_file])
|
||||
if self.disable_ctx_shift:
|
||||
server_args.extend(["--no-context-shift"])
|
||||
if self.api_key:
|
||||
server_args.extend(["--api-key", self.api_key])
|
||||
if self.draft_max:
|
||||
server_args.extend(["--draft-max", self.draft_max])
|
||||
if self.draft_min:
|
||||
server_args.extend(["--draft-min", self.draft_min])
|
||||
|
||||
args = [str(arg) for arg in [server_path, *server_args]]
|
||||
print(f"bench: starting server with: {' '.join(args)}")
|
||||
|
||||
flags = 0
|
||||
if "nt" == os.name:
|
||||
flags |= subprocess.DETACHED_PROCESS
|
||||
flags |= subprocess.CREATE_NEW_PROCESS_GROUP
|
||||
flags |= subprocess.CREATE_NO_WINDOW
|
||||
|
||||
self.process = subprocess.Popen(
|
||||
[str(arg) for arg in [server_path, *server_args]],
|
||||
creationflags=flags,
|
||||
stdout=sys.stdout,
|
||||
stderr=sys.stdout,
|
||||
env={**os.environ, "LLAMA_CACHE": "tmp"},
|
||||
)
|
||||
server_instances.add(self)
|
||||
|
||||
print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")
|
||||
|
||||
# wait for server to start
|
||||
start_time = time.time()
|
||||
while time.time() - start_time < timeout_seconds:
|
||||
try:
|
||||
response = self.make_request("GET", "/slots", headers={
|
||||
"Authorization": f"Bearer {self.api_key}" if self.api_key else None
|
||||
})
|
||||
if response.status_code == 200:
|
||||
self.ready = True
|
||||
return # server is ready
|
||||
except Exception as e:
|
||||
pass
|
||||
print(f"Waiting for server to start...")
|
||||
time.sleep(0.5)
|
||||
raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
|
||||
|
||||
def stop(self) -> None:
|
||||
if self in server_instances:
|
||||
server_instances.remove(self)
|
||||
if self.process:
|
||||
print(f"Stopping server with pid={self.process.pid}")
|
||||
self.process.kill()
|
||||
self.process = None
|
||||
|
||||
def make_request(
|
||||
self,
|
||||
method: str,
|
||||
path: str,
|
||||
data: dict | Any | None = None,
|
||||
headers: dict | None = None,
|
||||
) -> ServerResponse:
|
||||
url = f"http://{self.server_host}:{self.server_port}{path}"
|
||||
parse_body = False
|
||||
if method == "GET":
|
||||
response = requests.get(url, headers=headers)
|
||||
parse_body = True
|
||||
elif method == "POST":
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
parse_body = True
|
||||
elif method == "OPTIONS":
|
||||
response = requests.options(url, headers=headers)
|
||||
else:
|
||||
raise ValueError(f"Unimplemented method: {method}")
|
||||
result = ServerResponse()
|
||||
result.headers = dict(response.headers)
|
||||
result.status_code = response.status_code
|
||||
result.body = response.json() if parse_body else None
|
||||
print("Response from server", result.body)
|
||||
return result
|
||||
|
||||
def make_stream_request(
|
||||
self,
|
||||
method: str,
|
||||
path: str,
|
||||
data: dict | None = None,
|
||||
headers: dict | None = None,
|
||||
) -> Iterator[dict]:
|
||||
url = f"http://{self.server_host}:{self.server_port}{path}"
|
||||
if method == "POST":
|
||||
response = requests.post(url, headers=headers, json=data, stream=True)
|
||||
else:
|
||||
raise ValueError(f"Unimplemented method: {method}")
|
||||
for line_bytes in response.iter_lines():
|
||||
line = line_bytes.decode("utf-8")
|
||||
if '[DONE]' in line:
|
||||
break
|
||||
elif line.startswith('data: '):
|
||||
data = json.loads(line[6:])
|
||||
print("Partial response from server", data)
|
||||
yield data
|
||||
|
||||
|
||||
server_instances: Set[ServerProcess] = set()
|
||||
|
||||
|
||||
class ServerPreset:
|
||||
@staticmethod
|
||||
def tinyllama2() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "tinyllamas/stories260K.gguf"
|
||||
server.model_alias = "tinyllama-2"
|
||||
server.n_ctx = 256
|
||||
server.n_batch = 32
|
||||
server.n_slots = 2
|
||||
server.n_predict = 64
|
||||
server.seed = 42
|
||||
return server
|
||||
|
||||
@staticmethod
|
||||
def bert_bge_small() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
|
||||
server.model_alias = "bert-bge-small"
|
||||
server.n_ctx = 512
|
||||
server.n_batch = 128
|
||||
server.n_ubatch = 128
|
||||
server.n_slots = 2
|
||||
server.seed = 42
|
||||
server.server_embeddings = True
|
||||
return server
|
||||
|
||||
@staticmethod
|
||||
def tinyllama_infill() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
|
||||
server.model_alias = "tinyllama-infill"
|
||||
server.n_ctx = 2048
|
||||
server.n_batch = 1024
|
||||
server.n_slots = 1
|
||||
server.n_predict = 64
|
||||
server.temperature = 0.0
|
||||
server.seed = 42
|
||||
return server
|
||||
|
||||
@staticmethod
|
||||
def stories15m_moe() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.model_hf_repo = "ggml-org/stories15M_MOE"
|
||||
server.model_hf_file = "stories15M_MOE-F16.gguf"
|
||||
server.model_alias = "stories15m-moe"
|
||||
server.n_ctx = 2048
|
||||
server.n_batch = 1024
|
||||
server.n_slots = 1
|
||||
server.n_predict = 64
|
||||
server.temperature = 0.0
|
||||
server.seed = 42
|
||||
return server
|
||||
|
||||
@staticmethod
|
||||
def jina_reranker_tiny() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
|
||||
server.model_alias = "jina-reranker"
|
||||
server.n_ctx = 512
|
||||
server.n_batch = 512
|
||||
server.n_slots = 1
|
||||
server.seed = 42
|
||||
server.server_reranking = True
|
||||
return server
|
||||
|
||||
|
||||
def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
|
||||
"""
|
||||
Run multiple functions in parallel and return results in the same order as calls. Equivalent to Promise.all in JS.
|
||||
|
||||
Example usage:
|
||||
|
||||
results = parallel_function_calls([
|
||||
(func1, (arg1, arg2)),
|
||||
(func2, (arg3, arg4)),
|
||||
])
|
||||
"""
|
||||
results = [None] * len(function_list)
|
||||
exceptions = []
|
||||
|
||||
def worker(index, func, args):
|
||||
try:
|
||||
result = func(*args)
|
||||
results[index] = result
|
||||
except Exception as e:
|
||||
exceptions.append((index, str(e)))
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
futures = []
|
||||
for i, (func, args) in enumerate(function_list):
|
||||
future = executor.submit(worker, i, func, args)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in as_completed(futures):
|
||||
pass
|
||||
|
||||
# Check if there were any exceptions
|
||||
if exceptions:
|
||||
print("Exceptions occurred:")
|
||||
for index, error in exceptions:
|
||||
print(f"Function at index {index}: {error}")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def match_regex(regex: str, text: str) -> bool:
|
||||
return (
|
||||
re.compile(
|
||||
regex, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL
|
||||
).search(text)
|
||||
is not None
|
||||
)
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-simple-chat)
|
||||
add_executable(${TARGET} simple-chat.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-simple)
|
||||
add_executable(${TARGET} simple.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
|
||||
|
||||
```bash
|
||||
./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
|
||||
./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
|
||||
|
||||
...
|
||||
|
||||
|
||||
@@ -2,4 +2,4 @@ set(TARGET llama-speculative-simple)
|
||||
add_executable(${TARGET} speculative-simple.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -70,13 +70,13 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> inp;
|
||||
inp = common_tokenize(ctx_tgt, params.prompt, true, true);
|
||||
|
||||
if (llama_n_ctx(ctx_tgt) < (int) inp.size()) {
|
||||
if (llama_n_ctx(ctx_tgt) < (uint32_t) inp.size()) {
|
||||
LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (llama_n_batch(ctx_tgt) < (int) inp.size()) {
|
||||
if (llama_n_batch(ctx_tgt) < (uint32_t) inp.size()) {
|
||||
LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt));
|
||||
|
||||
return 1;
|
||||
@@ -117,7 +117,8 @@ int main(int argc, char ** argv) {
|
||||
llama_token id_last = inp.back();
|
||||
|
||||
// all tokens currently in the target context
|
||||
auto prompt_tgt = std::vector<llama_token>(inp.begin(), inp.end() - 1);
|
||||
llama_tokens prompt_tgt(inp.begin(), inp.end() - 1);
|
||||
prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
|
||||
|
||||
int n_past = inp.size() - 1;
|
||||
|
||||
@@ -154,7 +155,7 @@ int main(int argc, char ** argv) {
|
||||
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
|
||||
{
|
||||
// do not waste time on small drafts
|
||||
if (draft.size() < n_draft_min) {
|
||||
if (draft.size() < (size_t) n_draft_min) {
|
||||
draft.clear();
|
||||
}
|
||||
|
||||
@@ -181,54 +182,44 @@ int main(int argc, char ** argv) {
|
||||
GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
|
||||
|
||||
n_past += ids.size() - 1;
|
||||
n_drafted += batch_tgt.n_tokens - 1;
|
||||
n_drafted += draft.size(); // note: we ignore the discarded small drafts
|
||||
n_accept += ids.size() - 1;
|
||||
n_predict += ids.size();
|
||||
|
||||
// process the accepted tokens and update contexts
|
||||
//
|
||||
// this is the standard token post-processing that we normally do
|
||||
// in this case, we do it for a group of accepted tokens at once
|
||||
//
|
||||
{
|
||||
llama_token id;
|
||||
std::string token_str;
|
||||
for (size_t i = 0; i < ids.size(); ++i) {
|
||||
prompt_tgt.push_back(id_last);
|
||||
|
||||
for (size_t i = 0; i < ids.size(); ++i) {
|
||||
id = ids[i];
|
||||
id_last = ids[i];
|
||||
|
||||
++n_predict;
|
||||
|
||||
if (llama_token_is_eog(model_tgt, id)) {
|
||||
has_eos = true;
|
||||
break;
|
||||
}
|
||||
|
||||
token_str = common_token_to_piece(ctx_tgt, id);
|
||||
|
||||
if (params.use_color && i + 1 < ids.size()) {
|
||||
LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
|
||||
} else {
|
||||
LOG("%s", token_str.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||
if (llama_token_is_eog(model_tgt, id_last)) {
|
||||
has_eos = true;
|
||||
break;
|
||||
}
|
||||
|
||||
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str());
|
||||
const std::string token_str = common_token_to_piece(ctx_tgt, id_last);
|
||||
|
||||
{
|
||||
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
|
||||
if (params.use_color && i + 1 < ids.size()) {
|
||||
LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
|
||||
} else {
|
||||
LOG("%s", token_str.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
prompt_tgt.push_back(id_last);
|
||||
prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1);
|
||||
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
|
||||
|
||||
// remember the last accepted token for the next iteration
|
||||
id_last = id;
|
||||
{
|
||||
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
|
||||
}
|
||||
|
||||
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user