llama : make starcoder graph build more consistent with others

metal : fix out-of-bounds access in soft_max kernels
cleanup useless code
2026-05-21 00:14:08 +00:00 · 2023-09-15 17:57:10 +03:00 · 2023-09-15 17:56:49 +03:00 · 2023-09-15 19:00:14 +08:00 · 2023-09-15 18:45:43 +08:00 · 2023-09-15 15:23:50 +08:00
24 changed files with 2095 additions and 353 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -52,7 +52,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -87,7 +87,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -121,7 +121,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -149,7 +149,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -174,7 +174,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -280,7 +280,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

      - name: Download OpenCL SDK
        id: get_opencl
@@ -390,20 +390,19 @@ jobs:

    strategy:
      matrix:
-        cuda: ['12.1.0', '11.7.1']
+        cuda: ['12.2.0', '11.7.1']
        build: ['cublas']

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

-      - uses: Jimver/cuda-toolkit@v0.2.10
+      - uses: Jimver/cuda-toolkit@v0.2.11
        id: cuda-toolkit
        with:
          cuda: ${{ matrix.cuda }}
-          # TODO(green-sky): _dev seems to fail, and non dev are not enought
-          #sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
+          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

      - name: Build
        id: cmake_build
@@ -440,27 +439,11 @@ jobs:
            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip

      - name: Copy and pack Cuda runtime
-        if: ${{ matrix.cuda == '12.1.0' }}
-        # TODO(green-sky): paths are cuda 12 specific
        run: |
          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
-          mkdir '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
-
-      - name: Copy and pack Cuda runtime
-        if: ${{ matrix.cuda == '11.7.1' }}
-        # TODO(green-sky): paths are cuda 11 specific
-        run: |
-          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
-          mkdir '.\build\bin\cudart\'
-          ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+          $dst='.\build\bin\cudart\'
+          robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*

      - name: Upload Cuda runtime
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -469,6 +452,22 @@ jobs:
          path: |
            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

+  freeBSD-latest:
+    runs-on: macos-12
+    steps:
+    - name: Clone
+      uses: actions/checkout@v3
+
+    - name: Build
+      uses: cross-platform-actions/action@v0.19.0
+      with:
+        operating_system: freebsd
+        version: '13.2'
+        run: |
+            sudo pkg update
+            sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
+
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -485,7 +484,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

      - name: Determine tag name
        id: tag
@@ -543,7 +542,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |
@@ -567,7 +566,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |
@@ -591,7 +590,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |
@@ -621,7 +620,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -660,7 +659,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -706,7 +705,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -26,8 +26,15 @@ jobs:
    strategy:
      matrix:
        config:
-          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
+          #                     have disabled them for now until the reason why
+          #                     is understood.
+          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3
@@ -51,7 +58,7 @@ jobs:
        with:
          context: .
          push: true
-          platforms: linux/amd64,linux/arm64
+          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

@@ -60,6 +67,6 @@ jobs:
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
-          platforms: linux/amd64,linux/arm64
+          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -24,7 +24,7 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,8 +172,8 @@ if (LLAMA_METAL)
    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)

    message(STATUS "Metal framework found")
-
-    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
+    set(GGML_HEADERS_METAL ggml-metal.h)
+    set(GGML_SOURCES_METAL ggml-metal.m)

    add_compile_definitions(GGML_USE_METAL)
    if (LLAMA_METAL_NDEBUG)
@@ -192,7 +192,6 @@ if (LLAMA_METAL)
        ${METALKIT_FRAMEWORK}
        )
 endif()
-
 if (LLAMA_BLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
@@ -269,7 +268,8 @@ if (LLAMA_BLAS)
 endif()

 if (LLAMA_K_QUANTS)
-    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+    set(GGML_HEADERS_EXTRA k_quants.h)
+    set(GGML_SOURCES_EXTRA k_quants.c)
    add_compile_definitions(GGML_USE_K_QUANTS)
    if (LLAMA_QKK_64)
        add_compile_definitions(GGML_QKK_64)
@@ -285,7 +285,8 @@ if (LLAMA_CUBLAS)

        enable_language(CUDA)

-        set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
+        set(GGML_HEADERS_CUDA ggml-cuda.h)
+        set(GGML_SOURCES_CUDA ggml-cuda.cu)

        add_compile_definitions(GGML_USE_CUBLAS)
 #        if (LLAMA_CUDA_CUBLAS)
@@ -333,6 +334,7 @@ if (LLAMA_MPI)
    find_package(MPI)
    if (MPI_C_FOUND)
        message(STATUS "MPI found")
+        set(GGML_HEADERS_MPI ggml-mpi.h)
        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
        add_compile_definitions(GGML_USE_MPI)
        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
@@ -355,7 +357,8 @@ if (LLAMA_CLBLAST)
    if (CLBlast_FOUND)
        message(STATUS "CLBlast found")

-        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
+        set(GGML_HEADERS_OPENCL ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp)

        add_compile_definitions(GGML_USE_CLBLAST)

@@ -383,6 +386,9 @@ if (LLAMA_HIPBLAS)
        message(STATUS "HIP and hipBLAS found")
        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
+        if (BUILD_SHARED_LIBS)
+            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        endif()
        if (LLAMA_CUDA_FORCE_DMMV)
            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
        endif()
@@ -631,11 +637,11 @@ add_library(ggml OBJECT
            ggml.h
            ggml-alloc.c
            ggml-alloc.h
-            ${GGML_SOURCES_CUDA}
-            ${GGML_SOURCES_OPENCL}
-            ${GGML_SOURCES_METAL}
-            ${GGML_SOURCES_MPI}
-            ${GGML_SOURCES_EXTRA}
+            ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
            )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
@@ -673,14 +679,53 @@ if (BUILD_SHARED_LIBS)
    if (LLAMA_METAL)
        set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
    endif()
-    install(TARGETS llama LIBRARY)
 endif()

+
 #
 # install
 #

 include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
+set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
+    CACHE PATH "Location of header files")
+set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
+    CACHE PATH "Location of library files")
+set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
+    CACHE PATH "Location of binary files")
+set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
+set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
+set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+
+configure_package_config_file(
+        ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
+    PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
+              LLAMA_LIB_INSTALL_DIR
+              LLAMA_BIN_INSTALL_DIR )
+
+write_basic_package_version_file(
+        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
+    VERSION ${LLAMA_INSTALL_VERSION}
+    COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
+
+set(GGML_PUBLIC_HEADERS "ggml.h"
+        "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
+        "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
+
+set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+install(TARGETS ggml PUBLIC_HEADER)
+
+set_target_properties(llama PROPERTIES PUBLIC_HEADER llama.h)
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
+
 install(
    FILES convert.py
    PERMISSIONS
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -49,7 +49,7 @@ test: $(TEST_TARGETS)
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
 			continue; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
 		else \
 			echo "Running test $$test_target..."; \
@@ -110,50 +110,42 @@ MK_LDFLAGS  =
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
 # posix_memalign came in POSIX.1-2001 / SUSv3
 # M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-MK_CFLAGS   += -D_XOPEN_SOURCE=600
-MK_CXXFLAGS += -D_XOPEN_SOURCE=600
+MK_CPPFLAGS += -D_XOPEN_SOURCE=600

 # Somehow in OpenBSD whenever POSIX conformance is specified
 # some string functions rely on locale_t availability,
 # which was introduced in POSIX.1-2008, forcing us to go higher
 ifeq ($(UNAME_S),OpenBSD)
-	MK_CFLAGS   += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
-	MK_CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
+	MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
 endif

 # Data types, macros and functions related to controlling CPU affinity and
 # some memory allocation are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
-	MK_CFLAGS   += -D_GNU_SOURCE
-	MK_CXXFLAGS += -D_GNU_SOURCE
+	MK_CPPFLAGS += -D_GNU_SOURCE
 endif

 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
 # and on macOS its availability depends on enabling Darwin extensions
 # similarly on DragonFly, enabling BSD extensions is necessary
 ifeq ($(UNAME_S),Darwin)
-	MK_CFLAGS   += -D_DARWIN_C_SOURCE
-	MK_CXXFLAGS += -D_DARWIN_C_SOURCE
+	MK_CPPFLAGS += -D_DARWIN_C_SOURCE
 endif
 ifeq ($(UNAME_S),DragonFly)
-	MK_CFLAGS   += -D__BSD_VISIBLE
-	MK_CXXFLAGS += -D__BSD_VISIBLE
+	MK_CPPFLAGS += -D__BSD_VISIBLE
 endif

 # alloca is a non-standard interface that is not visible on BSDs when
 # POSIX conformance is specified, but not all of them provide a clean way
 # to enable it in such cases
 ifeq ($(UNAME_S),FreeBSD)
-	MK_CFLAGS   += -D__BSD_VISIBLE
-	MK_CXXFLAGS += -D__BSD_VISIBLE
+	MK_CPPFLAGS += -D__BSD_VISIBLE
 endif
 ifeq ($(UNAME_S),NetBSD)
-	MK_CFLAGS   += -D_NETBSD_SOURCE
-	MK_CXXFLAGS += -D_NETBSD_SOURCE
+	MK_CPPFLAGS += -D_NETBSD_SOURCE
 endif
 ifeq ($(UNAME_S),OpenBSD)
-	MK_CFLAGS   += -D_BSD_SOURCE
-	MK_CXXFLAGS += -D_BSD_SOURCE
+	MK_CPPFLAGS += -D_BSD_SOURCE
 endif

 ifdef LLAMA_DEBUG
@@ -182,7 +174,7 @@ MK_CFLAGS    += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow
 				-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
 MK_CXXFLAGS  += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

-ifeq '' '$(findstring clang++,$(CXX))'
+ifeq '' '$(findstring clang,$(shell $(CXX) --version))'
 	# g++ only
 	MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
 endif
@@ -605,7 +597,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-c.o: tests/test-c.c llama.h
--- a/README.md
+++ b/README.md
@@ -844,8 +844,17 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
 #### Images
 We have two Docker images available for this project:

-1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+
+Additionally, there are the following images, similar to the above:
+
+- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.

 #### Usage

--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+# HF baichuan --> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+import itertools
+import gguf
+import numpy as np
+import torch
+from sentencepiece import SentencePieceProcessor  # type: ignore[import]
+
+
+if TYPE_CHECKING:
+    from typing import TypeAlias
+
+NDArray: TypeAlias = 'np.ndarray[Any, Any]'
+
+# reverse HF permute back to original pth layout
+
+
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
+    """Reverse the HF row permutation of a projection weight, grouping rows by head."""
+    # A tensor with n_kv_head heads must be regrouped per KV head; n_head // n_kv_head is only correct when n_head == n_kv_head ** 2.
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head = n_kv_head
+
+    return weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape)
+
+def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int | None = None) -> NDArray:
+    rows_per_part = weights.shape[0] // 3  # rows are divided into three equal blocks
+    return reverse_hf_permute(weights[rows_per_part * n_part : rows_per_part * (n_part + 1), ...], n_head, n_head_kv)
+
+def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
+    rows_per_part = weights.shape[0] // 3  # rows are divided into three equal blocks
+    return weights[rows_per_part * n_part : rows_per_part * (n_part + 1), ...]
+
+def count_model_parts(dir_model: Path) -> int:
+    """Count the entries of dir_model whose names start with "pytorch_model-"."""
+    # dir_model is a pathlib.Path at the call site (argparse type=Path); os.listdir accepts it.
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
+    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype",     type=int, choices=[0, 1], nargs='?', default=1, help="output format - use 0 for float32, 1 for float16")  # nargs='?' makes the positional optional so default=1 can apply
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+print("hello print: ",hparams["architectures"][0])
+if hparams["architectures"][0] != "BaichuanForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+    # exit non-zero so calling scripts can detect the failed conversion
+    sys.exit(1)
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+print(f"num_parts:{num_parts}\n")
+ARCH=gguf.MODEL_ARCH.BAICHUAN
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+elif "model_max_length" in hparams:
+    ctx_length = hparams["model_max_length"]
+else:
+    print("gguf: can not find ctx length parameter.")
+    # exit non-zero so calling scripts can detect the failed conversion
+    sys.exit(1)
+
+
+gguf_writer.add_name(dir_model.name)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytes] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+tokenizer_model_file = dir_model / 'tokenizer.model'
+if not tokenizer_model_file.is_file():
+    print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
+    sys.exit(1)
+
+# vocab type sentencepiece
+print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+
+tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+
+for i in range(tokenizer.vocab_size()):
+    text: bytes
+    score: float
+    # fetch the piece text (UTF-8 encoded) and the score for token id i
+    piece = tokenizer.id_to_piece(i)
+    text = piece.encode("utf-8")
+    score = tokenizer.get_score(i)
+    # classify the token; the checks run in order, so a later match overrides an earlier one
+    toktype = 1  # default to normal token type
+    if tokenizer.is_unknown(i):
+        toktype = 2
+    if tokenizer.is_control(i):
+        toktype = 3
+
+    # toktype = 4 is user-defined = tokens from added_tokens.json
+
+    if tokenizer.is_unused(i):
+        toktype = 5
+    if tokenizer.is_byte(i):
+        toktype = 6
+    # record the token in the three parallel lists
+    tokens.append(text)
+    scores.append(score)
+    toktypes.append(toktype)
+
+added_tokens_file = dir_model / 'added_tokens.json'
+if added_tokens_file.is_file():
+    with open(added_tokens_file, "r", encoding="utf-8") as f:
+        addtokens_json = json.load(f)
+
+        print("gguf: get added tokens")
+
+        for key in addtokens_json:
+            tokens.append( key.encode("utf-8") )
+            scores.append(-1000.0)
+            toktypes.append(4) # user-defined token type
+
+
+gguf_writer.add_tokenizer_model("llama")
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model)
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")  # NOTE(review): torch.load unpickles the file; only run on trusted checkpoints
+    # tmp aliases model_part, so the unpacked q/k/v tensors below are added to the same dict
+    tmp=model_part
+    for i in range(block_count):
+        if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
+            print(f"Unpacking and permuting layer {i}")
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
+            tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
+            tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
+            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
+
+    for name in model_part.keys():
+        data = model_part[name]
+        # we don't need these
+        if name.endswith(".rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Can not map tensor '" + name + "'")
+            sys.exit(1)  # non-zero status: the conversion failed
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + " -> " +  new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -137,7 +137,9 @@ with open(tokenizer_json_file, "r", encoding="utf-8") as f:

 print("gguf: get gpt2 tokenizer vocab")

-vocab_size = len(tokenizer_json["model"]["vocab"])
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])

 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+# HF starcoder --> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer  # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Build the byte-level BPE lookup table used by GPT-2 style tokenizers.
+    Returns a dict that maps every byte value (0..255) to one unicode character.
+    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with
+    the same code point. The remaining bytes (whitespace/control characters the
+    bpe code barfs on) get the code points 256, 257, ... in increasing byte order.
+    The mapping is one-to-one, so inverting the dict turns vocab text back into bytes.
+    Dict insertion order: the self-mapped bytes first, then the remapped ones.
+    """
+    printable = [*range(ord("!"), ord("~") + 1), *range(ord("¡"), ord("¬") + 1), *range(ord("®"), ord("ÿ") + 1)]
+    byte_values = list(printable)
+    code_points = list(printable)
+    for byte in range(256):
+        if byte in printable:
+            continue
+        code_points.append(256 + len(byte_values) - len(printable))  # next unused code point >= 256
+        byte_values.append(byte)
+    return {byte: chr(cp) for byte, cp in zip(byte_values, code_points)}
+
+
+def count_model_parts(dir_model: Path) -> int:
+    """Count the directory entries of dir_model whose name starts with "pytorch_model-".
+
+    Prints the count when at least one such entry exists; returns 0 when there is none.
+    """
+    shard_count = sum(1 for entry in os.listdir(dir_model) if entry.startswith("pytorch_model-"))
+    if shard_count:
+        print("gguf: found " + str(shard_count) + " model parts")
+    return shard_count
+
+
+def parse_args() -> argparse.Namespace:  # CLI: [--vocab-only] [--outfile PATH] model [ftype]
+    parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
+    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--outfile",    type=Path,           help="path to write to; default: based on input")
+    parser.add_argument("model",        type=Path,           help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype",        type=int,            help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1, nargs = "?")  # optional, so the default applies
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit(1)
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.STARCODER
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layer"]
+
+gguf_writer.add_name("StarCoder")
+gguf_writer.add_context_length(2048) # not in config.json
+gguf_writer.add_embedding_length(hparams["n_embd"])
+gguf_writer.add_max_position_embeddings(hparams["n_positions"])
+gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(hparams["n_head"])
+gguf_writer.add_head_count_kv(hparams["n_head"])
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
+gguf_writer.add_file_type(ftype)
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+tokenizer_json_file = dir_model / 'tokenizer.json'
+if not tokenizer_json_file.is_file():
+    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
+    sys.exit(1)
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+    tokenizer_json = json.load(f)
+
+print("gguf: get gpt2 tokenizer vocab")
+
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+# Decode each vocab entry from its byte-level BPE spelling back to raw bytes; ids missing from the tokenizer are padded.
+for i in range(vocab_size):
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if c in byte_decoder:  # byte_decoder is keyed by character: map it back to its single byte
+                    text.append(byte_decoder[c])
+                else:  # character outside the byte-level alphabet: store its UTF-8 encoding
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        text = bytearray(f"[PAD{i}]".encode("utf8"))
+
+    tokens.append(text)
+    scores.append(0.0)                      # dummy
+    toktypes.append(gguf.TokenType.NORMAL)  # dummy
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# params for qkv transform
+n_head    = hparams["n_head"]
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+
+head_dim = hparams["n_embd"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+# Load every checkpoint part, expand the MQA c_attn tensors to MHA layout, cast dtypes per ftype and hand each tensor to the writer.
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(dir_model / part_name, map_location="cpu")  # NOTE: torch.load unpickles the file - only convert trusted checkpoints
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        if name.endswith(".attn.c_attn.weight") or name.endswith(".attn.c_attn.bias"):
+            print("Duplicate K,V heads to use MHA instead of MQA for", name)
+
+            embed_dim = hparams["n_embd"]
+            head_dim = embed_dim // hparams["n_head"]
+
+            # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
+            q, k, v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0)
+            # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
+            if len(k.shape) == 2:
+                k = np.tile(k, (hparams["n_head"], 1))
+                v = np.tile(v, (hparams["n_head"], 1))
+            elif len(k.shape) == 1:
+                k = np.tile(k, (hparams["n_head"]))
+                v = np.tile(v, (hparams["n_head"]))
+            # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
+            data = np.concatenate((q, k, v), axis=0)
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Can not map tensor '" + name + "'")
+            sys.exit(1)  # an unmappable tensor is an error: exit with a non-zero status
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
--- a/examples/main-cmake-pkg/.gitignore
+++ b/examples/main-cmake-pkg/.gitignore
@@ -0,0 +1,51 @@
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+*.gguf
+
+*.log
+.DS_Store
+.build/
+.cache/
+.direnv/
+.envrc
+.swiftpm
+.venv
+.clang-tidy
+.vs/
+.vscode/
+
+build*/
+out/
+tmp/
+
--- a/examples/main-cmake-pkg/CMakeLists.txt
+++ b/examples/main-cmake-pkg/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 3.12)
+project("main-cmake-pkg" C CXX)
+set(TARGET main-cmake-pkg)
+
+find_package(Llama 0.0.1 REQUIRED)
+
+# Bake common functionality in with target. Because applications
+# using the relocatable Llama package should be outside of the
+# source tree, main-cmake-pkg pretends the dependencies are built-in.
+
+set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
+add_library(common OBJECT
+    ${_common_path}/common.h
+    ${_common_path}/common.cpp
+    ${_common_path}/console.h
+    ${_common_path}/console.cpp
+    ${_common_path}/grammar-parser.h
+    ${_common_path}/grammar-parser.cpp
+    )
+
+# WARNING: because build-info.h is auto-generated, it will only
+# be available after the user has built the llama.cpp sources.
+#
+configure_file(${_common_path}/../build-info.h
+    ${CMAKE_CURRENT_BINARY_DIR}/build-info.h
+    COPYONLY)
+
+target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
+    ${CMAKE_CURRENT_BINARY_DIR})
+
+add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
+target_include_directories(${TARGET} PRIVATE ${_common_path})
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
--- a/examples/main-cmake-pkg/README.md
+++ b/examples/main-cmake-pkg/README.md
@@ -0,0 +1,37 @@
+# llama.cpp/example/main-cmake-pkg
+
+This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
+
+## Building
+
+Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
+
+### Considerations
+
+When hardware acceleration libraries are used (e.g. cuBLAS, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice that the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+
+### Build llama.cpp and install to C:\LlamaCPP directory
+
+In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
+
+```cmd
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+mkdir build
+cd build
+cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
+cmake --build . --config Release
+cmake --install . --prefix C:/LlamaCPP
+```
+
+### Build main-cmake-pkg
+
+
+```cmd
+cd ..\examples\main-cmake-pkg
+mkdir build
+cd build
+cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
+cmake --build . --config Release
+cmake --install . --prefix C:/MyLlamaApp
+```
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -82,7 +82,7 @@ int main(int argc, char ** argv) {
    //GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));

    // how many tokens to draft each time
-    const int n_draft = params.n_draft;
+    int n_draft = params.n_draft;

    int n_predict = 0;
    int n_drafted = 0;
@@ -131,6 +131,7 @@ int main(int argc, char ** argv) {
        LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));

        int i_dft = 0;
+
        while (true) {
            // sample from the target model
            const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
@@ -174,6 +175,27 @@ int main(int argc, char ** argv) {
            llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
            ++n_past_dft;

+            // heuristic for n_draft
+            {
+                const int  n_draft_cur  = (int) drafted.size();
+                const bool all_accepted = i_dft == n_draft_cur;
+
+                LOG("n_draft      = %d\n", n_draft);
+                LOG("n_draft_cur  = %d\n", n_draft_cur);
+                LOG("i_dft        = %d\n", i_dft);
+                LOG("all_accepted = %d\n", all_accepted);
+
+                if (all_accepted && n_draft == n_draft_cur) {
+                    LOG(" - max drafted tokens accepted - n_draft += 8\n");
+                    n_draft = std::min(30, n_draft + 8);
+                } else if (all_accepted) {
+                    LOG(" - partially drafted tokens accepted - no change\n");
+                } else {
+                    LOG(" - drafted token rejected - n_draft -= 1\n");
+                    n_draft = std::max(2, n_draft - 1);
+                }
+            }
+
            drafted.clear();
            drafted.push_back(id);

--- a/flake.nix
+++ b/flake.nix
@@ -45,6 +45,8 @@
        postInstall = ''
          mv $out/bin/main $out/bin/llama
          mv $out/bin/server $out/bin/llama-server
+          mkdir -p $out/include
+          cp ${src}/llama.h $out/include/
        '';
        cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
      in
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -118,7 +118,7 @@ kernel void kernel_soft_max(
    device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    // parallel max
-    float lmax = psrc0[tpitg[0]];
+    float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
        lmax = MAX(lmax, psrc0[i00]);
    }
@@ -158,7 +158,7 @@ kernel void kernel_soft_max_4(
    device       float4 * pdst4 = (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

    // parallel max
-    float4 lmax4 = psrc4[tpitg[0]];
+    float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
        lmax4 = fmax(lmax4, psrc4[i00]);
    }
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -36,12 +36,13 @@ KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
 KEY_GENERAL_FILE_TYPE            = "general.file_type"

 # LLM
-KEY_CONTEXT_LENGTH        = "{arch}.context_length"
-KEY_EMBEDDING_LENGTH      = "{arch}.embedding_length"
-KEY_BLOCK_COUNT           = "{arch}.block_count"
-KEY_FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
-KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
-KEY_TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
+KEY_CONTEXT_LENGTH          = "{arch}.context_length"
+KEY_EMBEDDING_LENGTH        = "{arch}.embedding_length"
+KEY_BLOCK_COUNT             = "{arch}.block_count"
+KEY_FEED_FORWARD_LENGTH     = "{arch}.feed_forward_length"
+KEY_USE_PARALLEL_RESIDUAL   = "{arch}.use_parallel_residual"
+KEY_TENSOR_DATA_LAYOUT      = "{arch}.tensor_data_layout"
+KEY_MAX_POSITION_EMBEDDINGS = "{arch}.max_position_embeddings"

 # attention
 KEY_ATTENTION_HEAD_COUNT        = "{arch}.attention.head_count"
@@ -77,12 +78,14 @@ KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"


 class MODEL_ARCH(IntEnum):
-    LLAMA  : int = auto()
-    FALCON : int = auto()
-    GPT2   : int = auto()
-    GPTJ   : int = auto()
-    GPTNEOX: int = auto()
-    MPT    : int = auto()
+    LLAMA         : int = auto()
+    FALCON        : int = auto()
+    BAICHUAN      : int = auto()
+    GPT2          : int = auto()
+    GPTJ          : int = auto()
+    GPTNEOX       : int = auto()
+    MPT           : int = auto()
+    STARCODER     : int = auto()


 class MODEL_TENSOR(IntEnum):
@@ -106,12 +109,14 @@ class MODEL_TENSOR(IntEnum):


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
-    MODEL_ARCH.LLAMA:   "llama",
-    MODEL_ARCH.FALCON:  "falcon",
-    MODEL_ARCH.GPT2:    "gpt2",
-    MODEL_ARCH.GPTJ:    "gptj",
-    MODEL_ARCH.GPTNEOX: "gptneox",
-    MODEL_ARCH.MPT:     "mpt",
+    MODEL_ARCH.LLAMA:          "llama",
+    MODEL_ARCH.FALCON:         "falcon",
+    MODEL_ARCH.BAICHUAN:       "baichuan",
+    MODEL_ARCH.GPT2:           "gpt2",
+    MODEL_ARCH.GPTJ:           "gptj",
+    MODEL_ARCH.GPTNEOX:        "gptneox",
+    MODEL_ARCH.MPT:            "mpt",
+    MODEL_ARCH.STARCODER:      "starcoder",
 }

 MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
@@ -153,6 +158,34 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
    },
+    MODEL_ARCH.BAICHUAN: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
+        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
+        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.STARCODER: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.POS_EMBD:      "position_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_QKV:      "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
    MODEL_ARCH.GPT2: {
        # TODO
    },
@@ -165,6 +198,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
+    MODEL_ARCH.BAICHUAN: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }


@@ -187,7 +224,7 @@ class TensorNameMap:
        # Output
        MODEL_TENSOR.OUTPUT: (
            "embed_out", # gptneox
-            "lm_head",   # gpt2 mpt falcon llama-hf
+            "lm_head",   # gpt2 mpt falcon llama-hf baichuan
            "output",    # llama-pth
        ),

@@ -195,7 +232,7 @@ class TensorNameMap:
        MODEL_TENSOR.OUTPUT_NORM: (
            "gpt_neox.final_layer_norm", # gptneox
            "transformer.ln_f",          # gpt2 falcon
-            "model.norm",                # llama-hf
+            "model.norm",                # llama-hf baichuan
            "norm",                      # llama-pth
        ),

@@ -311,6 +348,7 @@ class TensorNameMap:
            tensor_name = tensor_names.get(tensor)
            if tensor_name is None:
                continue
+            mapping[tensor_name] = (tensor, tensor_name)
            for key in keys:
                mapping[key] = (tensor, tensor_name)
        for bid in range(n_blocks):
@@ -319,11 +357,12 @@ class TensorNameMap:
                if tensor_name is None:
                    continue
                tensor_name = tensor_name.format(bid = bid)
+                mapping[tensor_name] = (tensor, tensor_name)
                for key in keys:
                    key = key.format(bid = bid)
                    mapping[key] = (tensor, tensor_name)

-    def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None:
+    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
        result = self.mapping.get(key)
        if result is not None:
            return result
@@ -334,13 +373,13 @@ class TensorNameMap:
                    return (result[0], result[1] + suffix)
        return None

-    def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None:
+    def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
        if result is None:
            return None
        return result[1]

-    def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None:
+    def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
        if result is None:
            return None
@@ -679,6 +718,10 @@ class GGUFWriter:
        self.add_uint32(
            KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)

+    def add_max_position_embeddings(self, length: int):
+        self.add_uint32(
+            KEY_MAX_POSITION_EMBEDDINGS.format(arch=self.arch), length)
+
    def add_block_count(self, length: int):
        self.add_uint32(
            KEY_BLOCK_COUNT.format(arch=self.arch), length)
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.3.2"
+version = "0.3.3"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
--- a/llama.cpp
+++ b/llama.cpp
--- a/prompts/chat-with-baichuan.txt
+++ b/prompts/chat-with-baichuan.txt
@@ -0,0 +1,4 @@
+以下内容为人类用户与一位智能助手的对话。
+
+用户:你好！
+助手:
--- a/scripts/LlamaConfig.cmake.in
+++ b/scripts/LlamaConfig.cmake.in
@@ -0,0 +1,69 @@
+set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
+set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
+set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
+set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
+set(LLAMA_BLAS @LLAMA_BLAS@)
+set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
+set(LLAMA_METAL @LLAMA_METAL@)
+set(LLAMA_MPI @LLAMA_MPI@)
+set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
+set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
+set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
+
+@PACKAGE_INIT@
+
+set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
+set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
+set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
+
+# Ensure transitive dependencies are satisfied
+
+find_package(Threads REQUIRED)
+if (APPLE AND LLAMA_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+endif()
+
+if (LLAMA_BLAS)
+    find_package(BLAS REQUIRED)
+endif()
+
+if (LLAMA_CUBLAS)
+    find_package(CUDAToolkit REQUIRED)
+endif()
+
+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+endif()
+
+if (LLAMA_MPI)
+    find_package(MPI REQUIRED)
+endif()
+
+if (LLAMA_CLBLAST)
+    find_package(CLBlast REQUIRED)
+endif()
+
+if (LLAMA_HIPBLAS)
+    find_package(hip REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(rocblas REQUIRED)
+endif()
+
+find_library(llama_LIBRARY llama
+    REQUIRED
+    HINTS ${LLAMA_LIB_DIR})
+
+set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
+add_library(llama UNKNOWN IMPORTED)
+set_target_properties(llama
+    PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
+        INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
+        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+        IMPORTED_LOCATION "${llama_LIBRARY}"
+        INTERFACE_COMPILE_FEATURES cxx_std_11
+        POSITION_INDEPENDENT_CODE ON )
+
+check_required_components(Llama)
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -29,9 +29,8 @@ llama_build_executable(test-tokenizer-0-llama.cpp)
 llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 llama_build_executable(test-tokenizer-0-falcon.cpp)
 #llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_build_executable(test-tokenizer-1.cpp)
-# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
-#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_build_executable(test-tokenizer-1-llama.cpp)
+llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 #llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -1,5 +1,6 @@
 #include "llama.h"
 #include "common.h"
+#include "console.h"

 #include <cstdio>
 #include <string>
@@ -89,6 +90,12 @@ int main(int argc, char **argv) {
        return 2;
    }

+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
+
    bool success = true;

    for (const auto & test_kv : k_tests()) {
--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-llama.cpp
@@ -0,0 +1,127 @@
+#include "llama.h"
+#include "common.h"
+#include "console.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <codecvt>
+#include <map>
+#include <vector>
+#include <locale>
+
+typedef int codepoint;
+
+std::string codepoint_to_utf8(codepoint cp) {
+    // Encode one code point as UTF-8 (1 to 4 bytes); values outside 0..0x10ffff throw std::invalid_argument.
+    if (cp < 0x00 || cp > 0x10ffff) throw std::invalid_argument("invalid codepoint");
+    std::string utf8;
+    if (cp <= 0x7f) {
+        utf8.push_back(cp);
+        return utf8;
+    }
+    if (cp <= 0x7ff) {
+        utf8.push_back(0xc0 | ((cp >> 6) & 0x1f));
+    } else if (cp <= 0xffff) {
+        utf8.push_back(0xe0 | ((cp >> 12) & 0x0f));
+        utf8.push_back(0x80 | ((cp >> 6) & 0x3f));
+    } else {
+        utf8.push_back(0xf0 | ((cp >> 18) & 0x07));
+        utf8.push_back(0x80 | ((cp >> 12) & 0x3f));
+        utf8.push_back(0x80 | ((cp >> 6) & 0x3f));
+    }
+    utf8.push_back(0x80 | (cp & 0x3f));  // every multi-byte form ends with the low 6 bits
+    return utf8;
+}
+
+// Round-trip test for an SPM vocab: token ids and unicode code points must survive tokenize/detokenize.
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        return 1;
+    }
+    const std::string fname = argv[1];
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init(false);
+
+    // load the vocab
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.vocab_only = true;
+
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            return 1;
+        }
+    }
+
+    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM);
+
+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
+
+    const int n_vocab = llama_n_vocab(ctx);
+    // 1) the text of every token id must be stable under tokenize -> detokenize
+    for (int i = 0; i < n_vocab; ++i) {
+        std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
+        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+        std::string check = llama_detokenize_spm(ctx, tokens);
+        if (check != str) {
+            fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%zu) but tokenization of this detokenizes to >%s<(%zu)\n",
+                __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
+            if(i != 3) // NOTE(review): token 3 is exempted; the reason is not visible here
+                return 2;
+        }
+    }
+    // 2) code points below 0xffff (surrogates 0xd800..0xdfff skipped) must round-trip
+    for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
+        if (cp < 0xd800 || cp > 0xdfff) {
+            std::string str = codepoint_to_utf8(cp);
+            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+            std::string check = llama_detokenize_spm(ctx, tokens);
+            if (str != check) {
+                fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%zu) instead of >%s<(%zu)\n",
+                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
+                if(cp != 0 && cp != 9601) // 9601 == U+2581; presumably the SPM whitespace marker - TODO confirm
+                    return 3;
+            }
+        }
+    }
+    for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
+        std::string str = codepoint_to_utf8(cp);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+        std::string check = llama_detokenize_spm(ctx, tokens);
+        if (str != check) {
+            fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%zu) instead of >%s<(%zu)\n",
+                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
+            return 4;
+        }
+    }
+
+    llama_free(ctx);          // release the context before the model it was created from
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@@ -1,108 +0,0 @@
-#include "llama.h"
-#include "common.h"
-
-#include <cassert>
-#include <cstdio>
-#include <cstring>
-#include <string>
-#include <codecvt>
-#include <map>
-#include <vector>
-#include <locale>
-
-static std::string escape_whitespace(const std::string& text) {
-    std::string result = "\xe2\x96\x81";
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ') {
-            result += "\xe2\x96\x81";
-        } else {
-            result += text[offs];
-        }
-    }
-    return result;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init(false);
-
-    // load the vocab
-    {
-        auto lparams = llama_context_default_params();
-
-        lparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), lparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        ctx = llama_new_context_with_model(model, lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
-
-    const int n_vocab = llama_n_vocab(ctx);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        std::string forward = llama_token_to_piece(ctx, i);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
-        if (tokens.size() == 1) {
-            if (i != tokens[0]) {
-                std::string backward = llama_token_to_piece(ctx, tokens[0]);
-                fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
-                    __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
-                return 2;
-            }
-        }
-    }
-
-#ifdef _WIN32
-    std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
-    for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
-        std::u16string u16str(1, ch);
-        std::string str = u16converter.to_bytes(u16str);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
-        if (tokens.size() == 1) {
-            fprintf(stderr, "%s : info: %s tokenized to %d \n",
-                __func__, str.c_str(), tokens[0]);
-        }
-    }
-
-    std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
-    for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
-        std::u32string u32str(1, ch);
-        std::string str = u32converter.to_bytes(u32str);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
-        if (tokens.size() == 1) {
-            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
-        }
-    }
-#endif
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return 0;
-}
Author	SHA1	Message	Date
Georgi Gerganov	92a4f86879	llama : make starcoder graph build more consistent with others	2023-09-15 17:57:10 +03:00
Georgi Gerganov	f82328ab65	metal : fix out-of-bounds access in soft_max kernels	2023-09-15 17:56:49 +03:00
Meng Zhang	6c353dc7c2	cleanup useless code	2023-09-15 19:00:14 +08:00
Meng Zhang	a1cf66ea94	working in cpu, metal buggy	2023-09-15 18:45:43 +08:00
Meng Zhang	101c578715	add TBD	2023-09-15 15:23:50 +08:00
Meng Zhang	8bc76a225d	add input embeddings handling	2023-09-15 14:47:04 +08:00
Meng Zhang	ab13d071e1	store mqa directly	2023-09-15 14:18:36 +08:00
Meng Zhang	4420cff654	fix vram calculation for starcoder	2023-09-15 13:52:43 +08:00
Meng Zhang	dac31da489	fix comments	2023-09-15 12:57:38 +08:00
Meng Zhang	0be15e162c	fix head count kv	2023-09-15 12:56:20 +08:00
Meng Zhang	77c7ec179c	properly load all starcoder params	2023-09-15 12:47:22 +08:00
Meng Zhang	2683611944	set n_positions to max_positioin_embeddings	2023-09-15 12:35:46 +08:00
Meng Zhang	a17ef39792	add max_position_embeddings	2023-09-15 12:35:17 +08:00
Meng Zhang	57f064d7c2	load starcoder weight	2023-09-15 12:12:33 +08:00
Meng Zhang	166a259f67	set head_count_kv = 1	2023-09-15 12:12:27 +08:00
Meng Zhang	7298c37e7e	add LLM_ARCH_STARCODER to llama.cpp	2023-09-15 11:49:21 +08:00
Meng Zhang	7e0a843b6a	fix ffn_down name	2023-09-15 11:45:18 +08:00
Meng Zhang	76d32cca59	convert MQA to MHA	2023-09-15 11:42:16 +08:00
Meng Zhang	eb7f0eba3e	support convert starcoder weights to gguf	2023-09-15 11:24:24 +08:00
Meng Zhang	0c5d4d87b0	add placeholder of starcoder in gguf / llama.cpp	2023-09-15 10:39:47 +08:00
Cebtenzzre	98311c4277	llama : make quantize example up to 2.7x faster (#3115 )	2023-09-14 21:09:53 -04:00
jneem	feea179e9f	flake : allow $out/include to already exist (#3175 )	2023-09-14 21:54:47 +03:00
Andrei	769266a543	cmake : compile ggml-rocm with -fpic when building shared library (#3158 )	2023-09-14 20:38:16 +03:00
Asbjørn Olling	cf8238e7f4	flake : include llama.h in nix output (#3159 )	2023-09-14 20:25:00 +03:00
Cebtenzzre	4b8560e72a	make : fix clang++ detection, move some definitions to CPPFLAGS (#3155 ) * make : fix clang++ detection * make : fix compiler definitions outside of CPPFLAGS	2023-09-14 20:22:47 +03:00
Alon	83a53b753a	CI: add FreeBSD & simplify CUDA windows (#3053 ) * add freebsd to ci * bump actions/checkout to v3 * bump cuda 12.1.0 -> 12.2.0 * bump Jimver/cuda-toolkit version * unify and simplify "Copy and pack Cuda runtime" * install only necessary cuda sub packages	2023-09-14 19:21:25 +02:00
akawrykow	5c872dbca2	falcon : use stated vocab size (#2914 )	2023-09-14 20:19:42 +03:00
bandoti	990a5e226a	cmake : add relocatable Llama package (#2960 ) * Keep static libs and headers with install * Add logic to generate Config package * Use proper build info * Add llama as import library * Prefix target with package name * Add example project using CMake package * Update README * Update README * Remove trailing whitespace	2023-09-14 20:04:40 +03:00
dylan	980ab41afb	docker : add gpu image CI builds (#3103 ) Enables the GPU enabled container images to be built and pushed alongside the CPU containers. Co-authored-by: canardleteer <eris.has.a.dad+github@gmail.com>	2023-09-14 19:47:00 +03:00
Kerfuffle	e394084166	gguf-py : support identity operation in TensorNameMap (#3095 ) Make try_suffixes keyword param optional.	2023-09-14 19:32:26 +03:00
jameswu2014	4c8643dd6e	feature : support Baichuan serial models (#3009 )	2023-09-14 12:32:10 -04:00
Leng Yue	35f73049af	speculative : add heuristic algorithm (#3006 ) * Add heuristic algo for speculative * Constrain minimum n_draft to 2 * speculative : improve heuristic impl * speculative : be more rewarding upon guessing max drafted tokens * speculative : fix typos --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-09-14 19:14:44 +03:00
goerch	71ca2fad7d	whisper : tokenizer fix + re-enable tokenizer test for LLaMa (#3096 ) * Fix für #2721 * Reenable tokenizer test for LLaMa * Add `console.cpp` dependency * Fix dependency to `common` * Fixing wrong fix. * Make console usage platform specific Work on compiler warnings. * Adapting makefile * Remove trailing whitespace * Adapting the other parts of the makefile * Fix typo.	2023-09-13 16:19:44 +03:00