ggml: cache sin/cos for RoPE (#4908 )

metal : remove old API (#4919 )
ggml-ci
2026-05-07 09:34:07 +00:00 · 2024-01-13 21:41:37 +01:00 · 2024-01-13 20:45:45 +02:00 · 2024-01-13 19:31:26 +02:00 · 2024-01-13 18:47:38 +02:00 · 2024-01-13 18:46:37 +02:00
13 changed files with 175 additions and 544 deletions
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -0,0 +1,55 @@
+name: Nix aarch64 builds
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+
+jobs:
+  nix-build-aarch64:
+    if: ${{ vars.CACHIX_NAME != '' }}
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+    - name: Install QEMU
+      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y qemu-user-static qemu-system-aarch64
+        sudo usermod -a -G kvm $USER
+    - name: Install Nix
+      uses: DeterminateSystems/nix-installer-action@v9
+      with:
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        extra-conf: |
+          extra-platforms = aarch64-linux
+          extra-system-features = nixos-test kvm
+          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+    - uses: DeterminateSystems/magic-nix-cache-action@v2
+      with:
+        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
+    - name: Set-up cachix to push the results to
+      uses: cachix/cachix-action@v13
+      with:
+        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+        name: ${{ vars.CACHIX_NAME }}
+    - name: Show all output paths
+      run: >
+          nix run github:nix-community/nix-eval-jobs
+          -- --gc-roots-dir gcroot
+          --flake
+          ".#packages.aarch64-linux"
+    - name: Build
+      run: >
+          nix run github:Mic92/nix-fast-build
+          -- --skip-cached --no-nom
+          --systems aarch64-linux
+          --flake
+          ".#checks.aarch64-linux"
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -69,44 +69,3 @@ jobs:
          -- --skip-cached --no-nom
          --flake
          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build-aarch64:
-    if: ${{ vars.CACHIX_NAME != '' }}
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install QEMU
-      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-      run: |
-        sudo apt-get install -y qemu-user-static qemu-system-aarch64
-        sudo usermod -a -G kvm $USER
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-platforms = aarch64-linux
-          extra-system-features = nixos-test kvm
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"
--- a/9
+++ b/9
@@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif

-ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
-BUILD_TARGETS += metal
-endif
-
 default: $(BUILD_TARGETS)

 test: $(TEST_TARGETS)
@@ -671,11 +667,6 @@ lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-ifdef LLAMA_METAL
-metal: examples/metal/metal.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-endif
-
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -37,9 +37,6 @@ else()
    add_subdirectory(lookup)
    add_subdirectory(train-text-from-scratch)
    add_subdirectory(imatrix)
-    if (LLAMA_METAL)
-        add_subdirectory(metal)
-    endif()
    if (LLAMA_BUILD_SERVER)
        add_subdirectory(server)
    endif()
--- a/examples/metal/CMakeLists.txt
+++ b/examples/metal/CMakeLists.txt
@@ -1,4 +0,0 @@
-set(TEST_TARGET metal)
-add_executable(${TEST_TARGET} metal.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@@ -1,103 +0,0 @@
-// Evaluate a statically exported ggml computation graph with Metal
-//
-// - First, export a LLaMA graph:
-//
-//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
-//
-// - Run this tool to evaluate the exported graph:
-//
-//  $ ./bin/metal llama.ggml
-//
-// The purpose of this tool is mostly for debugging and demonstration purposes.
-// The main limitation of exporting computation graphs is that their sizes are static which often
-// can be a problem for real-world applications.
-//
-
-#include "ggml.h"
-#include "ggml-metal.h"
-
-#include <cstdio>
-#include <cstring>
-#include <cstdlib>
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    if (argc != 2) {
-        fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]);
-        return -1;
-    }
-
-    const char * fname_cgraph = argv[1];
-
-    // load the compute graph
-    struct ggml_context * ctx_data = NULL;
-    struct ggml_context * ctx_eval = NULL;
-
-    struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-
-    // this allocates all Metal resources and memory buffers
-    auto * ctx_metal = ggml_metal_init(1);
-
-    const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
-    const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
-    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
-    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);
-
-    // main
-    {
-        struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
-        *(int32_t *) input->data = 1; // BOS
-
-        ggml_metal_set_tensor(ctx_metal, input);
-
-        // warmup
-        ggml_metal_graph_compute(ctx_metal, gf);
-
-        const int n_iter = 16;
-
-        const int64_t t0 = ggml_time_us();
-
-        // the actual inference happens here
-        for (int i = 0; i < n_iter; ++i) {
-            ggml_metal_graph_compute(ctx_metal, gf);
-        }
-
-        const int64_t t1 = ggml_time_us();
-
-        printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
-    }
-
-    // debug output
-    {
-        struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1];
-        ggml_metal_get_tensor(ctx_metal, logits);
-
-        float * ptr = (float *) ggml_get_data(logits);
-
-        printf("logits: ");
-        for (int i = 0; i < 10; i++) {
-            printf("%8.4f ", ptr[i]);
-        }
-        printf("\n");
-        int imax = 0;
-        double sum = 0.0;
-        double vmax = -1e9;
-        for (int i = 0; i < 32000; i++) {
-            sum += (double) ptr[i];
-            if (ptr[i] > vmax) {
-                vmax = ptr[i];
-                imax = i;
-            }
-        }
-        printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
-    }
-
-    ggml_metal_free(ctx_metal);
-
-    ggml_free(ctx_data);
-    ggml_free(ctx_eval);
-
-    return 0;
-}
-
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -45,13 +45,13 @@ int main(int argc, char ** argv) {
    // save state (rng, logits, embedding and kv_cache) to file
    {
        std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
+        const size_t written = llama_copy_state_data(ctx, state_mem.data());

-        {
-            FILE *fp_write = fopen("dump_state.bin", "wb");
-            llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
-            fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
-            fclose(fp_write);
-        }
+        FILE *fp_write = fopen("dump_state.bin", "wb");
+        fwrite(state_mem.data(), 1, written, fp_write);
+        fclose(fp_write);
+
+        fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size());
    }

    // save state (last tokens)
@@ -100,18 +100,17 @@ int main(int argc, char ** argv) {
        std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));

        FILE * fp_read = fopen("dump_state.bin", "rb");
+        const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+        fclose(fp_read);

-        const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read);
-        if (ret != state_mem.size()) {
+        if (read != llama_set_state_data(ctx2, state_mem.data())) {
            fprintf(stderr, "\n%s : failed to read state\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
            return 1;
        }

-        llama_set_state_data(ctx2, state_mem.data());
-
-        fclose(fp_read);
+        fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
    }

    // restore state (last tokens)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1180,8 +1180,9 @@ struct llama_server_context
        return slot.images.size() > 0;
    }

-    void send_error(task_server& task, std::string error)
+    void send_error(task_server& task, const std::string &error)
    {
+        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
        std::unique_lock<std::mutex> lock(mutex_results);
        task_result res;
        res.id = task.id;
@@ -1570,12 +1571,22 @@ struct llama_server_context
                        LOG_TEE("slot unavailable\n");
                        // send error result
                        send_error(task, "slot unavailable");
-                        return;
+                        break;
                    }

                    if (task.data.contains("system_prompt"))
                    {
+                        if (!all_slots_are_idle) {
+                            send_error(task, "system prompt can only be updated when all slots are idle");
+                            break;
+                        }
                        process_system_prompt_data(task.data["system_prompt"]);
+
+                        // reset cache_tokens for all slots
+                        for (llama_client_slot &slot : slots)
+                        {
+                            slot.cache_tokens.clear();
+                        }
                    }

                    slot->reset();
@@ -1652,8 +1663,7 @@ struct llama_server_context
        // attend tasks
        process_tasks();

-        // update the system prompt wait until all slots are idle state
-        if (system_need_update && all_slots_are_idle)
+        if (system_need_update)
        {
            LOG_TEE("updating system prompt\n");
            update_system_prompt();
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -36,64 +36,13 @@ struct ggml_cgraph;
 extern "C" {
 #endif

-//
-// internal API
-// temporary exposed to user-code
-//
-
-struct ggml_metal_context;
-
-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
-// number of command buffers to use
-struct ggml_metal_context * ggml_metal_init(int n_cb);
-void ggml_metal_free(struct ggml_metal_context * ctx);
-
-void * ggml_metal_host_malloc(size_t n);
-void   ggml_metal_host_free  (void * data);
-
-// set the number of command buffers to use
-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
-
-// creates a mapping between a host memory buffer and a device memory buffer
-// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
-// - the mapping is used during computation to determine the arguments of the compute kernels
-// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
-// - max_size specifies the maximum size of a tensor and is used to create shared views such
-//   that it is guaranteed that the tensor will fit in at least one of the views
-//
-bool ggml_metal_add_buffer(
-        struct ggml_metal_context * ctx,
-                       const char * name,
-                             void * data,
-                           size_t   size,
-                           size_t   max_size);
-
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// try to find operations that can be run concurrently in the graph
-// you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
-
-// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
-int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
-
-// output the concur_list for ggml_alloc
-int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
-
-// same as ggml_graph_compute but uses Metal
-// creates gf->n_threads command buffers in parallel
-bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
-
 //
 // backend API
 // user-code should use only these functions
 //

+GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
 GGML_API ggml_backend_t ggml_backend_metal_init(void);

 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -24,8 +24,6 @@

 #define UNUSED(x) (void)(x)

-#define GGML_MAX_CONCUR (2*GGML_DEFAULT_GRAPH_SIZE)
-
 #define GGML_METAL_MAX_KERNELS 256

 struct ggml_metal_buffer {
@@ -182,9 +180,6 @@ struct ggml_metal_context {

    struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];

-    int concur_list[GGML_MAX_CONCUR];
-    int concur_list_len;
-
    bool support_simdgroup_reduction;
    bool support_simdgroup_mm;
 };
@@ -200,7 +195,6 @@ struct ggml_metal_context {
@implementation GGMLMetalClass
@end

-
 static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
    fprintf(stderr, "%s", msg);

@@ -211,11 +205,6 @@ static void ggml_metal_default_log_callback(enum ggml_log_level level, const cha
 ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
 void * ggml_metal_log_user_data = NULL;

-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
-    ggml_metal_log_callback  = log_callback;
-    ggml_metal_log_user_data = user_data;
-}
-
 GGML_ATTRIBUTE_FORMAT(2, 3)
 static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
    if (ggml_metal_log_callback != NULL) {
@@ -238,7 +227,18 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
    }
 }

-struct ggml_metal_context * ggml_metal_init(int n_cb) {
+static void * ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
+    if (result != 0) {
+        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+
+    return data;
+}
+
+static struct ggml_metal_context * ggml_metal_init(int n_cb) {
    GGML_METAL_LOG_INFO("%s: allocating\n", __func__);

    id<MTLDevice> device;
@@ -264,7 +264,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
    ctx->queue  = [ctx->device newCommandQueue];
    ctx->n_buffers = 0;
-    ctx->concur_list_len = 0;

    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

@@ -398,9 +397,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
            struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
            kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \
            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \
-            GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
-                    (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
-                    (int) kernel->pipeline.threadExecutionWidth); \
            if (error) { \
                GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
                return NULL; \
@@ -534,7 +530,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    return ctx;
 }

-void ggml_metal_free(struct ggml_metal_context * ctx) {
+static void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);

    for (int i = 0; i < ctx->n_buffers; ++i) {
@@ -560,33 +556,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    free(ctx);
 }

-void * ggml_metal_host_malloc(size_t n) {
-    void * data = NULL;
-    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
-    if (result != 0) {
-        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
-        return NULL;
-    }
-
-    return data;
-}
-
-void ggml_metal_host_free(void * data) {
-    free(data);
-}
-
-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
-    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-}
-
-int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
-    return ctx->concur_list_len;
-}
-
-int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
-    return ctx->concur_list;
-}
-
 // temporarily defined here for compatibility between ggml-backend and the old API

 struct ggml_backend_metal_buffer {
@@ -659,209 +628,6 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
    return nil;
 }

-bool ggml_metal_add_buffer(
-        struct ggml_metal_context * ctx,
-                     const char * name,
-                           void * data,
-                         size_t   size,
-                         size_t   max_size) {
-    if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
-        GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__);
-        return false;
-    }
-
-    if (data) {
-        // verify that the buffer does not overlap with any of the existing buffers
-        for (int i = 0; i < ctx->n_buffers; ++i) {
-            const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
-
-            if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
-                GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
-                return false;
-            }
-        }
-
-        const size_t size_page = sysconf(_SC_PAGESIZE);
-
-        size_t size_aligned = size;
-        if ((size_aligned % size_page) != 0) {
-            size_aligned += (size_page - (size_aligned % size_page));
-        }
-
-        // the buffer fits into the max buffer size allowed by the device
-        if (size_aligned <= ctx->device.maxBufferLength) {
-            ctx->buffers[ctx->n_buffers].name = name;
-            ctx->buffers[ctx->n_buffers].data = data;
-            ctx->buffers[ctx->n_buffers].size = size;
-
-            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
-                return false;
-            }
-
-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);
-
-            ++ctx->n_buffers;
-        } else {
-            // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
-            // one of the views
-            const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
-            const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
-            const size_t size_view = ctx->device.maxBufferLength;
-
-            for (size_t i = 0; i < size; i += size_step) {
-                const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
-
-                ctx->buffers[ctx->n_buffers].name = name;
-                ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
-                ctx->buffers[ctx->n_buffers].size = size_step_aligned;
-
-                ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-                if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
-                    return false;
-                }
-
-                GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
-                if (i + size_step < size) {
-                    GGML_METAL_LOG_INFO("\n");
-                }
-
-                ++ctx->n_buffers;
-            }
-        }
-
-#if TARGET_OS_OSX
-        GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-                ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-                ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-        if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
-            GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-        } else {
-            GGML_METAL_LOG_INFO("\n");
-        }
-#else
-        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
-#endif
-    }
-
-    return true;
-}
-
-void ggml_metal_set_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    size_t offs;
-    id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
-}
-
-void ggml_metal_get_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    size_t offs;
-    id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
-}
-
-void ggml_metal_graph_find_concurrency(
-        struct ggml_metal_context * ctx,
-        struct ggml_cgraph * gf, bool check_mem) {
-    int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
-    int nodes_unused[GGML_MAX_CONCUR];
-
-    for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
-    for (int i = 0; i < gf->n_nodes;     i++) { nodes_unused[i]     = 1; }
-    ctx->concur_list_len = 0;
-
-    int n_left    = gf->n_nodes;
-    int n_start   = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
-    int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
-
-    while (n_left > 0) {
-        // number of nodes at a layer (that can be issued concurrently)
-        int concurrency = 0;
-        for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
-            if (nodes_unused[i]) {
-                // if the requirements for gf->nodes[i] are satisfied
-                int exe_flag = 1;
-
-                // scan all srcs
-                for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
-                    struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
-                    if (src_cur) {
-                        // if is leaf nodes it's satisfied.
-                        // TODO: ggml_is_leaf()
-                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
-                            continue;
-                        }
-
-                        // otherwise this src should be the output from previous nodes.
-                        int is_found = 0;
-
-                        // scan 2*search_depth back because we inserted barrier.
-                        //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
-                        for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
-                            if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
-                                is_found = 1;
-                                break;
-                            }
-                        }
-                        if (is_found == 0) {
-                            exe_flag = 0;
-                            break;
-                        }
-                    }
-                }
-                if (exe_flag && check_mem) {
-                    // check if nodes[i]'s data will be overwritten by a node before nodes[i].
-                    // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
-                    int64_t data_start = (int64_t) gf->nodes[i]->data;
-                    int64_t length     = (int64_t) ggml_nbytes(gf->nodes[i]);
-                    for (int j = n_start; j < i; j++) {
-                        if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
-                                            && gf->nodes[j]->op != GGML_OP_VIEW \
-                                            && gf->nodes[j]->op != GGML_OP_TRANSPOSE \
-                                            && gf->nodes[j]->op != GGML_OP_PERMUTE) {
-                            if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
-                                ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
-                                continue;
-                            }
-
-                            exe_flag = 0;
-                        }
-                    }
-                }
-                if (exe_flag) {
-                    ctx->concur_list[level_pos + concurrency] = i;
-                    nodes_unused[i] = 0;
-                    concurrency++;
-                    ctx->concur_list_len++;
-                }
-            }
-        }
-        n_left -= concurrency;
-        // adding a barrier different layer
-        ctx->concur_list[level_pos + concurrency] = -1;
-        ctx->concur_list_len++;
-        // jump all sorted nodes at nodes_bak
-        while (!nodes_unused[n_start]) {
-            n_start++;
-        }
-        level_pos += concurrency + 1;
-    }
-
-    if (ctx->concur_list_len > GGML_MAX_CONCUR) {
-        GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
-    }
-}
-
 static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_UNARY:
@@ -943,19 +709,15 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
    }
 }

-bool ggml_metal_graph_compute(
+static bool ggml_metal_graph_compute(
        struct ggml_metal_context * ctx,
               struct ggml_cgraph * gf) {
    @autoreleasepool {

-    // if there is ctx->concur_list, dispatch concurrently
-    // else fallback to serial dispatch
    MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;

-    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
-
-    const int n_nodes  = has_concur ? ctx->concur_list_len      : gf->n_nodes;
-    edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
+    const int n_nodes  = gf->n_nodes;
+    edesc.dispatchType = MTLDispatchTypeSerial;

    // create multiple command buffers and enqueue them
    // then, we encode the graph into the command buffers in parallel
@@ -986,7 +748,7 @@ bool ggml_metal_graph_compute(
            const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);

            for (int ind = node_start; ind < node_end; ++ind) {
-                const int i = has_concur ? ctx->concur_list[ind] : ind;
+                const int i = ind;

                if (i == -1) {
                    [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
@@ -2826,6 +2588,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
    /* .supports_op             = */ ggml_backend_metal_supports_op,
 };

+void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_metal_log_callback  = log_callback;
+    ggml_metal_log_user_data = user_data;
+}
+
 ggml_backend_t ggml_backend_metal_init(void) {
    struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);

@@ -2852,7 +2619,7 @@ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {

    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;

-    ggml_metal_set_n_cb(ctx, n_cb);
+    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
 }

 bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
--- a/ggml.c
+++ b/ggml.c
@@ -11638,6 +11638,21 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, fl
    return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
 }

+static void ggml_rope_cache_init(
+     float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale
+) {
+    float theta = theta_base;
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        rope_yarn(
+            theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        theta *= theta_scale;
+    }
+}
+
 void ggml_rope_yarn_corr_dims(
    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
 ) {
@@ -11720,6 +11735,12 @@ static void ggml_compute_forward_rope_f32(
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
            for (int64_t i1 = 0; i1 < ne1; i1++) {
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;
@@ -11753,18 +11774,13 @@ static void ggml_compute_forward_rope_f32(
                    }
                } else if (!is_neox) {
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        float cos_theta, sin_theta;
-                        rope_yarn(
-                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                        );
-                        sin_theta *= sin_sign;
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];

                        // zeta scaling for xPos only:
                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
                        if (xpos_down) zeta = 1.0f / zeta;

-                        theta_base *= theta_scale;
-
                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

@@ -11888,6 +11904,12 @@ static void ggml_compute_forward_rope_f16(
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
            for (int64_t i1 = 0; i1 < ne1; i1++) {
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;
@@ -11921,13 +11943,8 @@ static void ggml_compute_forward_rope_f16(
                    }
                } else if (!is_neox) {
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        float cos_theta, sin_theta;
-                        rope_yarn(
-                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                        );
-                        sin_theta *= sin_sign;
-
-                        theta_base *= theta_scale;
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];

                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@@ -16722,6 +16739,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                    }
                } break;
            case GGML_OP_SOFT_MAX:
+            case GGML_OP_ROPE:
                {
                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                } break;
--- a/llama.cpp
+++ b/llama.cpp
@@ -1266,7 +1266,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 struct llama_state {
    llama_state() {
 #ifdef GGML_USE_METAL
-        ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+        ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
 #endif
    }

@@ -9379,12 +9379,8 @@ struct llama_context * llama_new_context_with_model(
                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
        }

-        // resized during inference
-        if (params.logits_all) {
-            ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
-        } else {
-            ctx->logits.reserve(hparams.n_vocab);
-        }
+        // resized during inference, reserve maximum
+        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);

        if (params.embedding){
            ctx->embedding.resize(hparams.n_embd);
@@ -9731,8 +9727,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
    // for reference, std::mt19937(1337) serializes to 6701 bytes.
    const size_t s_rng_size        = sizeof(size_t);
    const size_t s_rng             = LLAMA_MAX_RNG_STATE;
-    const size_t s_logits_capacity = sizeof(size_t);
    const size_t s_logits_size     = sizeof(size_t);
+    // assume worst case for logits although only currently set ones are serialized
    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
    const size_t s_embedding_size  = sizeof(size_t);
    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
@@ -9743,7 +9739,6 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
    const size_t s_total = (
        + s_rng_size
        + s_rng
-        + s_logits_capacity
        + s_logits_size
        + s_logits
        + s_embedding_size
@@ -9812,37 +9807,27 @@ struct llama_data_file_context : llama_data_context {
 static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
    // copy rng
    {
-        std::stringstream rng_ss;
+        std::ostringstream rng_ss;
        rng_ss << ctx->rng;

-        const size_t rng_size = rng_ss.str().size();
-        char rng_buf[LLAMA_MAX_RNG_STATE];
+        const std::string & rng_str = rng_ss.str();
+        const size_t        rng_size = rng_str.size();

-        memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
-        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);

-        data_ctx->write(&rng_size,   sizeof(rng_size));
-        data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
+        data_ctx->write(&rng_size,      sizeof(rng_size));
+        data_ctx->write(rng_str.data(), rng_size);
    }

    // copy logits
    {
-        const size_t logits_cap  = ctx->logits.capacity();
        const size_t logits_size = ctx->logits.size();

-        data_ctx->write(&logits_cap,  sizeof(logits_cap));
        data_ctx->write(&logits_size, sizeof(logits_size));

        if (logits_size) {
            data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
        }
-
-        // If there is a gap between the size and the capacity, write padding
-        size_t padding_size = (logits_cap - logits_size) * sizeof(float);
-        if (padding_size > 0) {
-            std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
-            data_ctx->write(padding.data(), padding_size);
-        }
    }

    // copy embeddings
@@ -9925,13 +9910,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
    // set rng
    {
        size_t rng_size;
-        char   rng_buf[LLAMA_MAX_RNG_STATE];
+        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);

-        memcpy(&rng_size,   inp, sizeof(rng_size));    inp += sizeof(rng_size);
-        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);

-        std::stringstream rng_ss;
-        rng_ss.str(std::string(&rng_buf[0], rng_size));
+        std::string rng_str((char *)inp, rng_size); inp += rng_size;
+
+        std::istringstream rng_ss(rng_str);
        rng_ss >> ctx->rng;

        GGML_ASSERT(!rng_ss.fail());
@@ -9939,20 +9924,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

    // set logits
    {
-        size_t logits_cap;
        size_t logits_size;

-        memcpy(&logits_cap,  inp, sizeof(logits_cap));  inp += sizeof(logits_cap);
        memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

-        GGML_ASSERT(ctx->logits.capacity() == logits_cap);
+        GGML_ASSERT(ctx->logits.capacity() >= logits_size);

        if (logits_size) {
            ctx->logits.resize(logits_size);
-            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
-        }

-        inp += logits_cap * sizeof(float);
+            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
+            inp += logits_size * sizeof(float);
+        }
    }

    // set embeddings
@@ -10322,6 +10305,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
    if (0 <= token && token < llama_n_vocab(model)) {
        switch (llama_vocab_get_type(model->vocab)) {
        case LLAMA_VOCAB_TYPE_SPM: {
+            // NOTE: we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
            if (llama_is_normal_token(model->vocab, token)) {
                std::string result = model->vocab.id_to_token[token].text;
                llama_unescape_whitespace(result);
@@ -10330,6 +10315,13 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                }
                memcpy(buf, result.c_str(), result.length());
                return result.length();
+            } else if (llama_is_user_defined_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
                if (length < 3) {
                    return -3;
@@ -10344,14 +10336,12 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                }
                buf[0] = llama_token_to_byte(model->vocab, token);
                return 1;
-            } else {
-                // TODO: for now we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                // GGML_ASSERT(false);
            }
            break;
        }
        case LLAMA_VOCAB_TYPE_BPE: {
+            // NOTE: we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
            if (llama_is_normal_token(model->vocab, token)) {
                std::string result = model->vocab.id_to_token[token].text;
                result = llama_decode_text(result);
@@ -10360,12 +10350,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                }
                memcpy(buf, result.c_str(), result.length());
                return result.length();
+            } else if (llama_is_user_defined_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
            } else if (llama_is_control_token(model->vocab, token)) {
                ;
-            } else {
-                // TODO: for now we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                // GGML_ASSERT(false);
            }
            break;
        }
@@ -10477,7 +10470,7 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
    g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
-    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }

--- a/llama.h
+++ b/llama.h
@@ -43,7 +43,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 3
+#define LLAMA_SESSION_VERSION 4

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
Author	SHA1	Message	Date
Johannes Gäßler	c71d608ce7	ggml: cache sin/cos for RoPE (#4908 )	2024-01-13 21:41:37 +01:00
Georgi Gerganov	4be5ef556d	metal : remove old API (#4919 ) ggml-ci	2024-01-13 20:45:45 +02:00
Georgi Gerganov	0ea069b87b	server : fix prompt caching with system prompt (#4914 )	2024-01-13 19:31:26 +02:00
Georgi Gerganov	f172de03f1	llama : fix detokenization of non-special added-tokens (#4916 ) Co-authored-by: goerch <jhr.walter@t-online.de>	2024-01-13 18:47:38 +02:00
Georgi Gerganov	2d57de5255	metal : disable log for loaded kernels (#4794 )	2024-01-13 18:46:37 +02:00
David Friehs	df845cc982	llama : minimize size used for state save/load (#4820 ) * examples : save-load-state: save only required state * llama : only reserve n_vocab * n_batch at most for logits llama_decode asserts that only n_batch tokens are passed each call, and n_ctx is expected to be bigger than n_batch. * llama : always reserve n_vocab * n_batch for logits llama_context de-serialization breaks if the contexts have differing capacity for logits and llama_decode will at maximum resize to n_vocab * n_batch. * llama : only save and restore used logits for batch sizes of 512 this reduces save state in the best case by around 62 MB, which can be a lot if planning to save on each message to allow regenerating messages. * llama : use ostringstream and istringstream for save and load * llama : serialize rng into minimum amount of space required * llama : break session version due to serialization changes	2024-01-13 18:29:43 +02:00
Someone	6b48ed0893	workflows: unbreak nix-build-aarch64, and split it out (#4915 ) The fix should be just the `sudo apt-get update`	2024-01-13 16:29:16 +00:00