mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-21 00:14:08 +00:00
Compare commits
33 Commits
b1220
...
support-st
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
92a4f86879 | ||
|
|
f82328ab65 | ||
|
|
6c353dc7c2 | ||
|
|
a1cf66ea94 | ||
|
|
101c578715 | ||
|
|
8bc76a225d | ||
|
|
ab13d071e1 | ||
|
|
4420cff654 | ||
|
|
dac31da489 | ||
|
|
0be15e162c | ||
|
|
77c7ec179c | ||
|
|
2683611944 | ||
|
|
a17ef39792 | ||
|
|
57f064d7c2 | ||
|
|
166a259f67 | ||
|
|
7298c37e7e | ||
|
|
7e0a843b6a | ||
|
|
76d32cca59 | ||
|
|
eb7f0eba3e | ||
|
|
0c5d4d87b0 | ||
|
|
98311c4277 | ||
|
|
feea179e9f | ||
|
|
769266a543 | ||
|
|
cf8238e7f4 | ||
|
|
4b8560e72a | ||
|
|
83a53b753a | ||
|
|
5c872dbca2 | ||
|
|
990a5e226a | ||
|
|
980ab41afb | ||
|
|
e394084166 | ||
|
|
4c8643dd6e | ||
|
|
35f73049af | ||
|
|
71ca2fad7d |
75
.github/workflows/build.yml
vendored
75
.github/workflows/build.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -52,7 +52,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -87,7 +87,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -121,7 +121,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -149,7 +149,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -174,7 +174,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -280,7 +280,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
@@ -390,20 +390,19 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.1.0', '11.7.1']
|
||||
cuda: ['12.2.0', '11.7.1']
|
||||
build: ['cublas']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- uses: Jimver/cuda-toolkit@v0.2.10
|
||||
- uses: Jimver/cuda-toolkit@v0.2.11
|
||||
id: cuda-toolkit
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
# TODO(green-sky): _dev seems to fail, and non dev are not enought
|
||||
#sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
|
||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -440,27 +439,11 @@ jobs:
|
||||
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '12.1.0' }}
|
||||
# TODO(green-sky): paths are cuda 12 specific
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
mkdir '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '11.7.1' }}
|
||||
# TODO(green-sky): paths are cuda 11 specific
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
mkdir '.\build\bin\cudart\'
|
||||
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
@@ -469,6 +452,22 @@ jobs:
|
||||
path: |
|
||||
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
freeBSD-latest:
|
||||
runs-on: macos-12
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Build
|
||||
uses: cross-platform-actions/action@v0.19.0
|
||||
with:
|
||||
operating_system: freebsd
|
||||
version: '13.2'
|
||||
run: |
|
||||
sudo pkg update
|
||||
sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
|
||||
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
@@ -485,7 +484,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
@@ -543,7 +542,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
@@ -567,7 +566,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
@@ -591,7 +590,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
@@ -621,7 +620,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v1
|
||||
@@ -660,7 +659,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v1
|
||||
@@ -706,7 +705,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v1
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
|
||||
15
.github/workflows/docker.yml
vendored
15
.github/workflows/docker.yml
vendored
@@ -26,8 +26,15 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile" }
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
|
||||
# have disabled them for now until the reason why
|
||||
# is understood.
|
||||
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
@@ -51,7 +58,7 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
@@ -60,6 +67,6 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
2
.github/workflows/gguf-publish.yml
vendored
2
.github/workflows/gguf-publish.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
|
||||
@@ -172,8 +172,8 @@ if (LLAMA_METAL)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
|
||||
message(STATUS "Metal framework found")
|
||||
|
||||
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
|
||||
set(GGML_HEADERS_METAL ggml-metal.h)
|
||||
set(GGML_SOURCES_METAL ggml-metal.m)
|
||||
|
||||
add_compile_definitions(GGML_USE_METAL)
|
||||
if (LLAMA_METAL_NDEBUG)
|
||||
@@ -192,7 +192,6 @@ if (LLAMA_METAL)
|
||||
${METALKIT_FRAMEWORK}
|
||||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BLAS)
|
||||
if (LLAMA_STATIC)
|
||||
set(BLA_STATIC ON)
|
||||
@@ -269,7 +268,8 @@ if (LLAMA_BLAS)
|
||||
endif()
|
||||
|
||||
if (LLAMA_K_QUANTS)
|
||||
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
|
||||
set(GGML_HEADERS_EXTRA k_quants.h)
|
||||
set(GGML_SOURCES_EXTRA k_quants.c)
|
||||
add_compile_definitions(GGML_USE_K_QUANTS)
|
||||
if (LLAMA_QKK_64)
|
||||
add_compile_definitions(GGML_QKK_64)
|
||||
@@ -285,7 +285,8 @@ if (LLAMA_CUBLAS)
|
||||
|
||||
enable_language(CUDA)
|
||||
|
||||
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
|
||||
set(GGML_HEADERS_CUDA ggml-cuda.h)
|
||||
set(GGML_SOURCES_CUDA ggml-cuda.cu)
|
||||
|
||||
add_compile_definitions(GGML_USE_CUBLAS)
|
||||
# if (LLAMA_CUDA_CUBLAS)
|
||||
@@ -333,6 +334,7 @@ if (LLAMA_MPI)
|
||||
find_package(MPI)
|
||||
if (MPI_C_FOUND)
|
||||
message(STATUS "MPI found")
|
||||
set(GGML_HEADERS_MPI ggml-mpi.h)
|
||||
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
|
||||
add_compile_definitions(GGML_USE_MPI)
|
||||
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
|
||||
@@ -355,7 +357,8 @@ if (LLAMA_CLBLAST)
|
||||
if (CLBlast_FOUND)
|
||||
message(STATUS "CLBlast found")
|
||||
|
||||
set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
|
||||
set(GGML_HEADERS_OPENCL ggml-opencl.h)
|
||||
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
|
||||
|
||||
add_compile_definitions(GGML_USE_CLBLAST)
|
||||
|
||||
@@ -383,6 +386,9 @@ if (LLAMA_HIPBLAS)
|
||||
message(STATUS "HIP and hipBLAS found")
|
||||
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
|
||||
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
if (LLAMA_CUDA_FORCE_DMMV)
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
|
||||
endif()
|
||||
@@ -631,11 +637,11 @@ add_library(ggml OBJECT
|
||||
ggml.h
|
||||
ggml-alloc.c
|
||||
ggml-alloc.h
|
||||
${GGML_SOURCES_CUDA}
|
||||
${GGML_SOURCES_OPENCL}
|
||||
${GGML_SOURCES_METAL}
|
||||
${GGML_SOURCES_MPI}
|
||||
${GGML_SOURCES_EXTRA}
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
)
|
||||
|
||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||
@@ -673,14 +679,53 @@ if (BUILD_SHARED_LIBS)
|
||||
if (LLAMA_METAL)
|
||||
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
|
||||
endif()
|
||||
install(TARGETS llama LIBRARY)
|
||||
endif()
|
||||
|
||||
|
||||
#
|
||||
# install
|
||||
#
|
||||
|
||||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
|
||||
CACHE PATH "Location of header files")
|
||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
|
||||
CACHE PATH "Location of library files")
|
||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
|
||||
CACHE PATH "Location of binary files")
|
||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
|
||||
PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
|
||||
LLAMA_LIB_INSTALL_DIR
|
||||
LLAMA_BIN_INSTALL_DIR )
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
install(TARGETS ggml PUBLIC_HEADER)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER llama.h)
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
|
||||
install(
|
||||
FILES convert.py
|
||||
PERMISSIONS
|
||||
|
||||
32
Makefile
32
Makefile
@@ -2,7 +2,7 @@
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
|
||||
|
||||
# Code coverage output files
|
||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||
@@ -49,7 +49,7 @@ test: $(TEST_TARGETS)
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
|
||||
continue; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
||||
continue; \
|
||||
else \
|
||||
echo "Running test $$test_target..."; \
|
||||
@@ -110,50 +110,42 @@ MK_LDFLAGS =
|
||||
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
|
||||
# posix_memalign came in POSIX.1-2001 / SUSv3
|
||||
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
|
||||
MK_CFLAGS += -D_XOPEN_SOURCE=600
|
||||
MK_CXXFLAGS += -D_XOPEN_SOURCE=600
|
||||
MK_CPPFLAGS += -D_XOPEN_SOURCE=600
|
||||
|
||||
# Somehow in OpenBSD whenever POSIX conformance is specified
|
||||
# some string functions rely on locale_t availability,
|
||||
# which was introduced in POSIX.1-2008, forcing us to go higher
|
||||
ifeq ($(UNAME_S),OpenBSD)
|
||||
MK_CFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
|
||||
MK_CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
|
||||
MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
|
||||
endif
|
||||
|
||||
# Data types, macros and functions related to controlling CPU affinity and
|
||||
# some memory allocation are available on Linux through GNU extensions in libc
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
MK_CFLAGS += -D_GNU_SOURCE
|
||||
MK_CXXFLAGS += -D_GNU_SOURCE
|
||||
MK_CPPFLAGS += -D_GNU_SOURCE
|
||||
endif
|
||||
|
||||
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
||||
# and on macOS its availability depends on enabling Darwin extensions
|
||||
# similarly on DragonFly, enabling BSD extensions is necessary
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
MK_CFLAGS += -D_DARWIN_C_SOURCE
|
||||
MK_CXXFLAGS += -D_DARWIN_C_SOURCE
|
||||
MK_CPPFLAGS += -D_DARWIN_C_SOURCE
|
||||
endif
|
||||
ifeq ($(UNAME_S),DragonFly)
|
||||
MK_CFLAGS += -D__BSD_VISIBLE
|
||||
MK_CXXFLAGS += -D__BSD_VISIBLE
|
||||
MK_CPPFLAGS += -D__BSD_VISIBLE
|
||||
endif
|
||||
|
||||
# alloca is a non-standard interface that is not visible on BSDs when
|
||||
# POSIX conformance is specified, but not all of them provide a clean way
|
||||
# to enable it in such cases
|
||||
ifeq ($(UNAME_S),FreeBSD)
|
||||
MK_CFLAGS += -D__BSD_VISIBLE
|
||||
MK_CXXFLAGS += -D__BSD_VISIBLE
|
||||
MK_CPPFLAGS += -D__BSD_VISIBLE
|
||||
endif
|
||||
ifeq ($(UNAME_S),NetBSD)
|
||||
MK_CFLAGS += -D_NETBSD_SOURCE
|
||||
MK_CXXFLAGS += -D_NETBSD_SOURCE
|
||||
MK_CPPFLAGS += -D_NETBSD_SOURCE
|
||||
endif
|
||||
ifeq ($(UNAME_S),OpenBSD)
|
||||
MK_CFLAGS += -D_BSD_SOURCE
|
||||
MK_CXXFLAGS += -D_BSD_SOURCE
|
||||
MK_CPPFLAGS += -D_BSD_SOURCE
|
||||
endif
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
@@ -182,7 +174,7 @@ MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow
|
||||
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
|
||||
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
|
||||
|
||||
ifeq '' '$(findstring clang++,$(CXX))'
|
||||
ifeq '' '$(findstring clang,$(shell $(CXX) --version))'
|
||||
# g++ only
|
||||
MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
|
||||
endif
|
||||
@@ -605,7 +597,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-c.o: tests/test-c.c llama.h
|
||||
|
||||
13
README.md
13
README.md
@@ -844,8 +844,17 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
|
||||
#### Images
|
||||
We have two Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
Additionally, there the following images, similar to the above:
|
||||
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
|
||||
|
||||
#### Usage
|
||||
|
||||
|
||||
292
convert-baichuan-hf-to-gguf.py
Executable file
292
convert-baichuan-hf-to-gguf.py
Executable file
@@ -0,0 +1,292 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF baichuan --> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
import itertools
|
||||
import gguf
|
||||
import numpy as np
|
||||
import torch
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import TypeAlias
|
||||
|
||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
||||
|
||||
# reverse HF permute back to original pth layout
|
||||
|
||||
|
||||
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
|
||||
if n_kv_head is not None and n_head != n_kv_head:
|
||||
n_head //= n_kv_head
|
||||
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
|
||||
r = weights.shape[0] // 3
|
||||
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
|
||||
|
||||
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
|
||||
r = weights.shape[0] // 3
|
||||
return weights[r * n_part : r * n_part + r, ...]
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
|
||||
return num_parts
|
||||
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
print("hello print: ",hparams["architectures"][0])
|
||||
if hparams["architectures"][0] != "BaichuanForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
print(f"num_parts:{num_parts}\n")
|
||||
ARCH=gguf.MODEL_ARCH.BAICHUAN
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
head_count = hparams["num_attention_heads"]
|
||||
|
||||
if "num_key_value_heads" in hparams:
|
||||
head_count_kv = hparams["num_key_value_heads"]
|
||||
else:
|
||||
head_count_kv = head_count
|
||||
|
||||
if "_name_or_path" in hparams:
|
||||
hf_repo = hparams["_name_or_path"]
|
||||
else:
|
||||
hf_repo = ""
|
||||
|
||||
if "max_sequence_length" in hparams:
|
||||
ctx_length = hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in hparams:
|
||||
ctx_length = hparams["max_position_embeddings"]
|
||||
elif "model_max_length" in hparams:
|
||||
ctx_length = hparams["model_max_length"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
gguf_writer.add_name(dir_model.name)
|
||||
gguf_writer.add_source_hf_repo(hf_repo)
|
||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
|
||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
||||
if "type" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"]["type"] == "linear":
|
||||
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
|
||||
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_model_file = dir_model / 'tokenizer.model'
|
||||
if not tokenizer_model_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# vocab type sentencepiece
|
||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1 # defualt to normal token type
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
|
||||
# toktype = 4 is user-defined = tokens from added_tokens.json
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
added_tokens_file = dir_model / 'added_tokens.json'
|
||||
if added_tokens_file.is_file():
|
||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||
addtokens_json = json.load(f)
|
||||
|
||||
print("gguf: get added tokens")
|
||||
|
||||
for key in addtokens_json:
|
||||
tokens.append( key.encode("utf-8") )
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(4) # user-defined token type
|
||||
|
||||
|
||||
gguf_writer.add_tokenizer_model("llama")
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
tmp=model_part
|
||||
for i in range(block_count):
|
||||
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
|
||||
print(f"Unpacking and permuting layer {i}")
|
||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
|
||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
|
||||
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
|
||||
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
@@ -137,7 +137,9 @@ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
267
convert-starcoder-hf-to-gguf.py
Executable file
267
convert-starcoder-hf-to-gguf.py
Executable file
@@ -0,0 +1,267 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF starcoder --> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.STARCODER
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["n_layer"]
|
||||
|
||||
gguf_writer.add_name("StarCoder")
|
||||
gguf_writer.add_context_length(2048) # not in config.json
|
||||
gguf_writer.add_embedding_length(hparams["n_embd"])
|
||||
gguf_writer.add_max_position_embeddings(hparams["n_positions"])
|
||||
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["n_head"])
|
||||
gguf_writer.add_head_count_kv(hparams["n_head"])
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
gguf_writer.add_file_type(ftype)
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dymmy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["n_head"]
|
||||
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
|
||||
|
||||
head_dim = hparams["n_embd"] // n_head
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
if name.endswith(".attn.c_attn.weight") or name.endswith(".attn.c_attn.bias"):
|
||||
print("Duplicate K,V heads to use MHA instead of MQA for", name)
|
||||
|
||||
embed_dim = hparams["n_embd"]
|
||||
head_dim = embed_dim // hparams["n_head"]
|
||||
|
||||
# ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
|
||||
q, k ,v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0)
|
||||
# duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
|
||||
if len(k.shape) == 2:
|
||||
k = np.tile(k, (hparams["n_head"], 1))
|
||||
v = np.tile(v, (hparams["n_head"], 1))
|
||||
elif len(k.shape) == 1:
|
||||
k = np.tile(k, (hparams["n_head"]))
|
||||
v = np.tile(v, (hparams["n_head"]))
|
||||
# concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
|
||||
data = np.concatenate((q, k, v), axis=0)
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
51
examples/main-cmake-pkg/.gitignore
vendored
Normal file
51
examples/main-cmake-pkg/.gitignore
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
# Prerequisites
|
||||
*.d
|
||||
|
||||
# Compiled Object files
|
||||
*.slo
|
||||
*.lo
|
||||
*.o
|
||||
*.obj
|
||||
|
||||
# Precompiled Headers
|
||||
*.gch
|
||||
*.pch
|
||||
|
||||
# Compiled Dynamic libraries
|
||||
*.so
|
||||
*.dylib
|
||||
*.dll
|
||||
|
||||
# Fortran module files
|
||||
*.mod
|
||||
*.smod
|
||||
|
||||
# Compiled Static libraries
|
||||
*.lai
|
||||
*.la
|
||||
*.a
|
||||
*.lib
|
||||
|
||||
# Executables
|
||||
*.exe
|
||||
*.out
|
||||
*.app
|
||||
|
||||
*.gguf
|
||||
|
||||
*.log
|
||||
.DS_Store
|
||||
.build/
|
||||
.cache/
|
||||
.direnv/
|
||||
.envrc
|
||||
.swiftpm
|
||||
.venv
|
||||
.clang-tidy
|
||||
.vs/
|
||||
.vscode/
|
||||
|
||||
build*/
|
||||
out/
|
||||
tmp/
|
||||
|
||||
36
examples/main-cmake-pkg/CMakeLists.txt
Normal file
36
examples/main-cmake-pkg/CMakeLists.txt
Normal file
@@ -0,0 +1,36 @@
|
||||
cmake_minimum_required(VERSION 3.12)
|
||||
project("main-cmake-pkg" C CXX)
|
||||
set(TARGET main-cmake-pkg)
|
||||
|
||||
find_package(Llama 0.0.1 REQUIRED)
|
||||
|
||||
# Bake common functionality in with target. Because applications
|
||||
# using the relocatable Llama package should be outside of the
|
||||
# source tree, main-cmake-pkg pretends the dependencies are built-in.
|
||||
|
||||
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
|
||||
add_library(common OBJECT
|
||||
${_common_path}/common.h
|
||||
${_common_path}/common.cpp
|
||||
${_common_path}/console.h
|
||||
${_common_path}/console.cpp
|
||||
${_common_path}/grammar-parser.h
|
||||
${_common_path}/grammar-parser.cpp
|
||||
)
|
||||
|
||||
# WARNING: because build-info.h is auto-generated, it will only
|
||||
# be available after the user has built the llama.cpp sources.
|
||||
#
|
||||
configure_file(${_common_path}/../build-info.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/build-info.h
|
||||
COPYONLY)
|
||||
|
||||
target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
|
||||
${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
|
||||
target_include_directories(${TARGET} PRIVATE ${_common_path})
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
37
examples/main-cmake-pkg/README.md
Normal file
37
examples/main-cmake-pkg/README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# llama.cpp/example/main-cmake-pkg
|
||||
|
||||
This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
|
||||
|
||||
## Building
|
||||
|
||||
Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
|
||||
|
||||
### Considerations
|
||||
|
||||
When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
|
||||
|
||||
### Build llama.cpp and install to C:\LlamaCPP directory
|
||||
|
||||
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
|
||||
|
||||
```cmd
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build . --config Release
|
||||
cmake --install . --prefix C:/LlamaCPP
|
||||
```
|
||||
|
||||
### Build main-cmake-pkg
|
||||
|
||||
|
||||
```cmd
|
||||
cd ..\examples\main-cmake-pkg
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build . --config Release
|
||||
cmake --install . --prefix C:/MyLlamaApp
|
||||
```
|
||||
@@ -82,7 +82,7 @@ int main(int argc, char ** argv) {
|
||||
//GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
|
||||
|
||||
// how many tokens to draft each time
|
||||
const int n_draft = params.n_draft;
|
||||
int n_draft = params.n_draft;
|
||||
|
||||
int n_predict = 0;
|
||||
int n_drafted = 0;
|
||||
@@ -131,6 +131,7 @@ int main(int argc, char ** argv) {
|
||||
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
|
||||
|
||||
int i_dft = 0;
|
||||
|
||||
while (true) {
|
||||
// sample from the target model
|
||||
const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
|
||||
@@ -174,6 +175,27 @@ int main(int argc, char ** argv) {
|
||||
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
|
||||
++n_past_dft;
|
||||
|
||||
// heuristic for n_draft
|
||||
{
|
||||
const int n_draft_cur = (int) drafted.size();
|
||||
const bool all_accepted = i_dft == n_draft_cur;
|
||||
|
||||
LOG("n_draft = %d\n", n_draft);
|
||||
LOG("n_draft_cur = %d\n", n_draft_cur);
|
||||
LOG("i_dft = %d\n", i_dft);
|
||||
LOG("all_accepted = %d\n", all_accepted);
|
||||
|
||||
if (all_accepted && n_draft == n_draft_cur) {
|
||||
LOG(" - max drafted tokens accepted - n_draft += 8\n");
|
||||
n_draft = std::min(30, n_draft + 8);
|
||||
} else if (all_accepted) {
|
||||
LOG(" - partially drafted tokens accepted - no change\n");
|
||||
} else {
|
||||
LOG(" - drafted token rejected - n_draft -= 1\n");
|
||||
n_draft = std::max(2, n_draft - 1);
|
||||
}
|
||||
}
|
||||
|
||||
drafted.clear();
|
||||
drafted.push_back(id);
|
||||
|
||||
|
||||
@@ -45,6 +45,8 @@
|
||||
postInstall = ''
|
||||
mv $out/bin/main $out/bin/llama
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
mkdir -p $out/include
|
||||
cp ${src}/llama.h $out/include/
|
||||
'';
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
||||
in
|
||||
|
||||
@@ -118,7 +118,7 @@ kernel void kernel_soft_max(
|
||||
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
// parallel max
|
||||
float lmax = psrc0[tpitg[0]];
|
||||
float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
|
||||
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
|
||||
lmax = MAX(lmax, psrc0[i00]);
|
||||
}
|
||||
@@ -158,7 +158,7 @@ kernel void kernel_soft_max_4(
|
||||
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
|
||||
// parallel max
|
||||
float4 lmax4 = psrc4[tpitg[0]];
|
||||
float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
|
||||
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
|
||||
lmax4 = fmax(lmax4, psrc4[i00]);
|
||||
}
|
||||
|
||||
@@ -36,12 +36,13 @@ KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
|
||||
KEY_GENERAL_FILE_TYPE = "general.file_type"
|
||||
|
||||
# LLM
|
||||
KEY_CONTEXT_LENGTH = "{arch}.context_length"
|
||||
KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||
KEY_BLOCK_COUNT = "{arch}.block_count"
|
||||
KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
||||
KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
||||
KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
KEY_CONTEXT_LENGTH = "{arch}.context_length"
|
||||
KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||
KEY_BLOCK_COUNT = "{arch}.block_count"
|
||||
KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
||||
KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
||||
KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
KEY_MAX_POSITION_EMBEDDINGS = "{arch}.max_position_embeddings"
|
||||
|
||||
# attention
|
||||
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
|
||||
@@ -77,12 +78,14 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
|
||||
|
||||
|
||||
class MODEL_ARCH(IntEnum):
|
||||
LLAMA : int = auto()
|
||||
FALCON : int = auto()
|
||||
GPT2 : int = auto()
|
||||
GPTJ : int = auto()
|
||||
GPTNEOX: int = auto()
|
||||
MPT : int = auto()
|
||||
LLAMA : int = auto()
|
||||
FALCON : int = auto()
|
||||
BAICHUAN : int = auto()
|
||||
GPT2 : int = auto()
|
||||
GPTJ : int = auto()
|
||||
GPTNEOX : int = auto()
|
||||
MPT : int = auto()
|
||||
STARCODER : int = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -106,12 +109,14 @@ class MODEL_TENSOR(IntEnum):
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.GPT2: "gpt2",
|
||||
MODEL_ARCH.GPTJ: "gptj",
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.BAICHUAN: "baichuan",
|
||||
MODEL_ARCH.GPT2: "gpt2",
|
||||
MODEL_ARCH.GPTJ: "gptj",
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
MODEL_ARCH.STARCODER: "starcoder",
|
||||
}
|
||||
|
||||
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
||||
@@ -153,6 +158,34 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.BAICHUAN: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.STARCODER: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.GPT2: {
|
||||
# TODO
|
||||
},
|
||||
@@ -165,6 +198,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.BAICHUAN: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -187,7 +224,7 @@ class TensorNameMap:
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan
|
||||
"output", # llama-pth
|
||||
),
|
||||
|
||||
@@ -195,7 +232,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 falcon
|
||||
"model.norm", # llama-hf
|
||||
"model.norm", # llama-hf baichuan
|
||||
"norm", # llama-pth
|
||||
),
|
||||
|
||||
@@ -311,6 +348,7 @@ class TensorNameMap:
|
||||
tensor_name = tensor_names.get(tensor)
|
||||
if tensor_name is None:
|
||||
continue
|
||||
mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
mapping[key] = (tensor, tensor_name)
|
||||
for bid in range(n_blocks):
|
||||
@@ -319,11 +357,12 @@ class TensorNameMap:
|
||||
if tensor_name is None:
|
||||
continue
|
||||
tensor_name = tensor_name.format(bid = bid)
|
||||
mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
key = key.format(bid = bid)
|
||||
mapping[key] = (tensor, tensor_name)
|
||||
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None:
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
|
||||
result = self.mapping.get(key)
|
||||
if result is not None:
|
||||
return result
|
||||
@@ -334,13 +373,13 @@ class TensorNameMap:
|
||||
return (result[0], result[1] + suffix)
|
||||
return None
|
||||
|
||||
def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None:
|
||||
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
|
||||
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||
if result is None:
|
||||
return None
|
||||
return result[1]
|
||||
|
||||
def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None:
|
||||
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
|
||||
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||
if result is None:
|
||||
return None
|
||||
@@ -679,6 +718,10 @@ class GGUFWriter:
|
||||
self.add_uint32(
|
||||
KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_max_position_embeddings(self, length: int):
|
||||
self.add_uint32(
|
||||
KEY_MAX_POSITION_EMBEDDINGS.format(arch=self.arch), length)
|
||||
|
||||
def add_block_count(self, length: int):
|
||||
self.add_uint32(
|
||||
KEY_BLOCK_COUNT.format(arch=self.arch), length)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.3.2"
|
||||
version = "0.3.3"
|
||||
description = "Write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
|
||||
4
prompts/chat-with-baichuan.txt
Normal file
4
prompts/chat-with-baichuan.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
以下内容为人类用户与与一位智能助手的对话。
|
||||
|
||||
用户:你好!
|
||||
助手:
|
||||
69
scripts/LlamaConfig.cmake.in
Normal file
69
scripts/LlamaConfig.cmake.in
Normal file
@@ -0,0 +1,69 @@
|
||||
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
|
||||
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
|
||||
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
|
||||
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||
set(LLAMA_BLAS @LLAMA_BLAS@)
|
||||
set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
|
||||
set(LLAMA_METAL @LLAMA_METAL@)
|
||||
set(LLAMA_MPI @LLAMA_MPI@)
|
||||
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
|
||||
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
|
||||
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
|
||||
|
||||
@PACKAGE_INIT@
|
||||
|
||||
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
|
||||
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
||||
|
||||
# Ensure transient dependencies satisfied
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
if (APPLE AND LLAMA_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BLAS)
|
||||
find_package(BLAS REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_MPI)
|
||||
find_package(MPI REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_HIPBLAS)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
endif()
|
||||
|
||||
find_library(llama_LIBRARY llama
|
||||
REQUIRED
|
||||
HINTS ${LLAMA_LIB_DIR})
|
||||
|
||||
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
|
||||
add_library(llama UNKNOWN IMPORTED)
|
||||
set_target_properties(llama
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
|
||||
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${llama_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES cxx_std_11
|
||||
POSITION_INDEPENDENT_CODE ON )
|
||||
|
||||
check_required_components(Llama)
|
||||
@@ -29,9 +29,8 @@ llama_build_executable(test-tokenizer-0-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_build_executable(test-tokenizer-1.cpp)
|
||||
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
|
||||
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
@@ -89,6 +90,12 @@ int main(int argc, char **argv) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// We need this for unicode console support
|
||||
console::init(false, false);
|
||||
atexit([]() { console::cleanup(); });
|
||||
#endif
|
||||
|
||||
bool success = true;
|
||||
|
||||
for (const auto & test_kv : k_tests()) {
|
||||
|
||||
127
tests/test-tokenizer-1-llama.cpp
Normal file
127
tests/test-tokenizer-1-llama.cpp
Normal file
@@ -0,0 +1,127 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <codecvt>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
|
||||
typedef int codepoint;
|
||||
|
||||
std::string codepoint_to_utf8(codepoint cp) {
|
||||
std::string result;
|
||||
if (0x00 <= cp && cp <= 0x7f) {
|
||||
result.push_back(cp);
|
||||
} else if (0x80 <= cp && cp <= 0x7ff) {
|
||||
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
} else if (0x800 <= cp && cp <= 0xffff) {
|
||||
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
} else if (0x10000 <= cp && cp <= 0x10ffff) {
|
||||
result.push_back(0xf0 | ((cp >> 18) & 0x07));
|
||||
result.push_back(0x80 | ((cp >> 12) & 0x3f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
} else {
|
||||
throw std::invalid_argument("invalid codepoint");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string fname = argv[1];
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
llama_backend_init(false);
|
||||
|
||||
// load the vocab
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.vocab_only = true;
|
||||
|
||||
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx = llama_new_context_with_model(model, lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM);
|
||||
|
||||
#ifdef _WIN32
|
||||
// We need this for unicode console support
|
||||
console::init(false, false);
|
||||
atexit([]() { console::cleanup(); });
|
||||
#endif
|
||||
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
if (check != str) {
|
||||
fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%llu) but tokenization of this detokenizes to >%s<(%llu)\n",
|
||||
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
|
||||
if(i != 3)
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
|
||||
if (cp < 0xd800 || cp > 0xdfff) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
if(cp != 0 && cp != 9601)
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,108 +0,0 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <codecvt>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
|
||||
static std::string escape_whitespace(const std::string& text) {
|
||||
std::string result = "\xe2\x96\x81";
|
||||
for (size_t offs = 0; offs < text.length(); ++offs) {
|
||||
if (text[offs] == ' ') {
|
||||
result += "\xe2\x96\x81";
|
||||
} else {
|
||||
result += text[offs];
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string fname = argv[1];
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
llama_backend_init(false);
|
||||
|
||||
// load the vocab
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.vocab_only = true;
|
||||
|
||||
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx = llama_new_context_with_model(model, lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
|
||||
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
std::string forward = llama_token_to_piece(ctx, i);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
|
||||
if (tokens.size() == 1) {
|
||||
if (i != tokens[0]) {
|
||||
std::string backward = llama_token_to_piece(ctx, tokens[0]);
|
||||
fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
|
||||
__func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
|
||||
for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
|
||||
std::u16string u16str(1, ch);
|
||||
std::string str = u16converter.to_bytes(u16str);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
|
||||
if (tokens.size() == 1) {
|
||||
fprintf(stderr, "%s : info: %s tokenized to %d \n",
|
||||
__func__, str.c_str(), tokens[0]);
|
||||
}
|
||||
}
|
||||
|
||||
std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
|
||||
for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
|
||||
std::u32string u32str(1, ch);
|
||||
std::string str = u32converter.to_bytes(u32str);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
|
||||
if (tokens.size() == 1) {
|
||||
fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user