Compare commits


19 Commits

Author SHA1 Message Date
Georgi Gerganov
c1d7cb28d3 ggml : do not sched_yield when calling BLAS (#4761)
* ggml : do not sched_yield when calling BLAS

ggml-ci

* ggml : fix do_yield logic

ggml-ci

* ggml : simplify do_yield logic

ggml-ci
2024-01-05 15:18:21 +02:00
Georgi Gerganov
3681f22443 examples : add few-shot translation example (#4783) 2024-01-05 15:11:10 +02:00
Daniel Bevenius
b3a7c20b5c finetune : remove unused includes (#4756)
This commit removes unused includes from finetune.cpp.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
2024-01-04 21:45:37 +02:00
Georgi Gerganov
012cf349ae server : send token probs for "stream == false" (#4714) 2024-01-04 19:56:33 +02:00
Johannes Gäßler
a91928014f Print backend name on test-backend-ops failure (#4751) 2024-01-04 09:43:23 +01:00
singularity
3c0b585561 llama.swiftui : support loading custom model from file picker (#4767)
* swiftui: support load model from file picker

* swiftui: remove trailing whitespace
2024-01-04 10:22:38 +02:00
Michael Coppola
e5804313a1 server : fix options in README.md (#4765)
* fix examples/server/README.md

* minor : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-04 10:17:09 +02:00
Georgi Gerganov
dc891b7f7a ggml : include stdlib.h before intrin.h (#4736) 2024-01-04 10:12:26 +02:00
singularity
46cea79e1f llama.swiftui : fix build of ggml.metallib (#4754)
* metal: fix metal backend init failure in swiftui

* metal: build ggml.metallib instead of copy src

* llama.swift : remove debug flags from metallib build

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-04 09:58:16 +02:00
Daniel Bevenius
cb1e2818e0 train : fix typo in overlapping-samples help msg (#4758)
This commit fixes a typo in the help message for the
--overlapping-samples option.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
2024-01-03 19:53:40 +02:00
Ashraful Islam
ece9a45e8f swift : update Package.swift to use ggml as dependency (#4691)
* updates the package.swift to use ggml as dependency

* changes the ggml package url src to ggerganov
2024-01-03 19:30:02 +02:00
Georgi Gerganov
7bed7eba35 cuda : simplify expression
Co-authored-by: slaren <slarengh@gmail.com>
2024-01-03 14:38:38 +02:00
Georgi Gerganov
d55356d3ba cuda : mark I16 and I32 ops as unsupported
ggml-ci
2024-01-03 14:38:38 +02:00
Georgi Gerganov
75e3fd8581 sync : ggml
ggml-ci
2024-01-03 14:38:38 +02:00
Georgi Gerganov
289313716f metal : add kernel_get_rows_i32
ggml-ci
2024-01-03 14:38:38 +02:00
Georgi Gerganov
ab62fc3e55 scripts : fix sync order + metal sed 2024-01-03 14:38:38 +02:00
Guillaume Wenzek
5f66ebca9c ggml : extend ggml_get_rows, ggml_repeat, ggml_concat (ggml/639)
* add more int ops

* ggml_compute_forward_dup_bytes

* add tests

* PR comments

* tests : minor indentations

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-03 14:38:38 +02:00
Justin Parker
f2eb19bd8b server : throw an error when slot unavailable (#4741) 2024-01-03 10:43:19 +02:00
Georgi Gerganov
f3f62f0d83 metal : optimize ggml_mul_mat_id (faster Mixtral PP) (#4725)
* ggml : disable fast-math for Metal (cmake build only)

ggml-ci

* metal : fix Metal API debug warnings

* cmake : add -fno-inline for Metal build (#4545)

* metal : fix API debug warnings

* metal : fix compile warnings

* metal : use uint64_t for strides

* cmake : rename option to LLAMA_METAL_SHADER_DEBUG

* metal : fix mat-vec Q8_0 kernel for BS > 1

* metal : normalize mat-vec kernel signatures

* cmake : respect LLAMA_QKK_64 option

* metal : fix mat-vec Q4_K kernel for QK_K == 64

* metal : optimizing ggml_mul_mat_id (wip)

* metal : minor fix

* metal : opt mul_mm_id
2024-01-02 21:07:47 +02:00
18 changed files with 601 additions and 118 deletions

Package.swift

@@ -13,21 +13,17 @@ let package = Package(
products: [
.library(name: "llama", targets: ["llama"]),
],
dependencies: [
.package(url: "https://github.com/ggerganov/ggml.git", .branch("master"))
],
targets: [
.target(
name: "llama",
dependencies: ["ggml"],
path: ".",
exclude: [],
sources: [
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
],
publicHeadersPath: "spm-headers",
cSettings: [

common/train.cpp

@@ -1107,7 +1107,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n");
fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stderr, " --overlapping-samples Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
fprintf(stderr, " --overlapping-samples Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");

examples/base-translate.sh (new executable file, 56 lines)

@@ -0,0 +1,56 @@
#!/bin/bash
#
# Few-shot translation example.
# Requires a base model (i.e. no fine-tuned or instruct models).
#
# Usage:
#
# cd llama.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>"
#
if [ $# -ne 2 ]; then
echo "Usage: ./base-translate.sh <model-base> \"<text>\""
exit 1
fi
ftmp="__llama.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT
echo "Translate from English to French:
===
sea otter, peppermint, plush girafe:
sea otter => loutre de mer
peppermint => menthe poivrée
plush girafe => girafe peluche
===
violin
violin => violon
===
phone, computer, mouse, keyboard:
phone => téléphone
computer => ordinateur
mouse => souris
keyboard => clavier
===
" > $ftmp
echo "$2
" >> $ftmp
model=$1
# generate the most likely continuation, run on the CPU until the string "===" is found
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -ngl 0 -r "==="

examples/finetune/finetune.cpp

@@ -3,15 +3,9 @@
#include "llama.h"
#include "common.h"
#include "train.h"
#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <ctime>
#include <random>
#include <stdexcept>
#include <algorithm>
#include <string>

examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj

@@ -9,7 +9,6 @@
/* Begin PBXBuildFile section */
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
@@ -24,8 +23,26 @@
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
F1FE20DC2B465C4500B45541 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
/* End PBXBuildFile section */
/* Begin PBXBuildRule section */
F1FE20DB2B465C2100B45541 /* PBXBuildRule */ = {
isa = PBXBuildRule;
compilerSpec = com.apple.compilers.proxy.script;
fileType = sourcecode.metal;
inputFiles = (
);
isEditable = 1;
outputFiles = (
"${DERIVED_FILES_DIR}/ggml-metal.air",
"${DERIVED_FILES_DIR}/ggml.metallib",
);
script = "# metal\nxcrun metal -c \"${INPUT_FILE_PATH}\" -o \"${DERIVED_FILES_DIR}/${INPUT_FILE_BASE}.air\"\nxcrun metallib -o \"${DERIVED_FILES_DIR}/${INPUT_FILE_BASE%-metal}.metallib\" \"${DERIVED_FILES_DIR}/${INPUT_FILE_BASE}.air\"\n";
};
/* End PBXBuildRule section */
/* Begin PBXFileReference section */
542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
@@ -52,6 +69,7 @@
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -166,6 +184,7 @@
children = (
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */,
);
path = UI;
sourceTree = "<group>";
@@ -190,6 +209,7 @@
8A1C83712AC328BD0096AF73 /* Resources */,
);
buildRules = (
F1FE20DB2B465C2100B45541 /* PBXBuildRule */,
);
dependencies = (
);
@@ -241,7 +261,7 @@
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
F1FE20DC2B465C4500B45541 /* ggml-metal.metal in Resources */,
8A3F84242AC4C891005E2EE8 /* models in Resources */,
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
@@ -257,6 +277,7 @@
files = (
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */,
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,

examples/llama.swiftui/llama.swiftui/UI/ContentView.swift

@@ -103,6 +103,8 @@ struct ContentView: View {
ContentView.cleanupModelCaches()
llamaState.cacheCleared = true
}
LoadCustomButton(llamaState: llamaState)
}
.padding(.top, 4)
.font(.system(size: 12))

examples/llama.swiftui/llama.swiftui/UI/LoadCustomButton.swift (new file)

@@ -0,0 +1,44 @@
import SwiftUI
import UniformTypeIdentifiers
struct LoadCustomButton: View {
@ObservedObject private var llamaState: LlamaState
@State private var showFileImporter = false
init(llamaState: LlamaState) {
self.llamaState = llamaState
}
var body: some View {
VStack {
Button(action: {
showFileImporter = true
}) {
Text("Load Custom Model")
}
}
.fileImporter(
isPresented: $showFileImporter,
allowedContentTypes: [UTType(filenameExtension: "gguf", conformingTo: .data)!],
allowsMultipleSelection: false
) { result in
switch result {
case .success(let files):
files.forEach { file in
let gotAccess = file.startAccessingSecurityScopedResource()
if !gotAccess { return }
do {
try llamaState.loadModel(modelUrl: file.absoluteURL)
} catch let err {
print("Error: \(err.localizedDescription)")
}
file.stopAccessingSecurityScopedResource()
}
case .failure(let error):
print(error)
}
}
}
}

examples/server/README.md

@@ -168,6 +168,12 @@ node index.js
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
*Result JSON:*
Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
@@ -198,12 +204,6 @@ node index.js
`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- **POST** `/tokenize`: Tokenize a given text.
*Options:*

examples/server/public/completion.js

@@ -95,6 +95,15 @@ export async function* llama(prompt, params = {}, config = {}) {
break;
}
}
if (result.error) {
result.error = JSON.parse(result.error);
if (result.error.content.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error: ${result.error.content}`);
}
}
if (result.error) {
result.error = JSON.parse(result.error);
console.error(`llama.cpp error: ${result.error.content}`);

examples/server/server.cpp

@@ -1265,7 +1265,7 @@ struct llama_server_context
{
std::vector<completion_token_output> probs_output = {};
const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
if (probs_pos < probs_stop_pos)
{
@@ -1325,7 +1325,7 @@ struct llama_server_context
{
probs = std::vector<completion_token_output>(
slot.generated_token_probs.begin(),
slot.generated_token_probs.begin() + slot.sent_token_probs_index);
slot.generated_token_probs.end());
}
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
}

ggml-cuda.cu

@@ -10039,14 +10039,19 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
}
return false;
} break;
case GGML_OP_DUP:
case GGML_OP_REPEAT:
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_REPEAT:
case GGML_OP_DUP:
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
@@ -10063,7 +10068,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
case GGML_OP_CONCAT:
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
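For context, a minimal sketch (using the public ggml-backend API of this period; not part of the patch): schedulers and tests consult this per-backend gate before placing a node, so returning false here for I16/I32 DUP/REPEAT/CONCAT routes those ops back to the CPU backend.

#include "ggml.h"
#include "ggml-backend.h"

// Hedged sketch: callers check the backend's supports_op gate (the CUDA
// function this hunk edits implements it) before offloading a graph node.
static bool can_offload(ggml_backend_t backend, struct ggml_tensor * node) {
    // false for e.g. an I32 GGML_OP_CONCAT on CUDA => run it on the CPU backend
    return ggml_backend_supports_op(backend, node);
}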

ggml-impl.h

@@ -5,6 +5,7 @@
// GGML internal header
#include <assert.h>
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy

ggml-metal.m

@@ -87,6 +87,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
GGML_METAL_DECL_KERNEL(get_rows_i32);
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(group_norm);
GGML_METAL_DECL_KERNEL(norm);
@@ -377,6 +378,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(get_rows_q4_K);
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
GGML_METAL_ADD_KERNEL(get_rows_i32);
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(group_norm);
GGML_METAL_ADD_KERNEL(norm);
@@ -499,6 +501,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(get_rows_q4_K);
GGML_METAL_DEL_KERNEL(get_rows_q5_K);
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
GGML_METAL_DEL_KERNEL(get_rows_i32);
GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(group_norm);
GGML_METAL_DEL_KERNEL(norm);
@@ -1657,6 +1660,10 @@ void ggml_metal_graph_compute(
}
};
if (ggml_is_quantized(src0t)) {
GGML_ASSERT(ne00 >= nth0*nth1);
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -1715,6 +1722,9 @@ void ggml_metal_graph_compute(
// TODO: make this more general
GGML_ASSERT(n_as <= 8);
// max size of the src1ids array in the kernel stack
GGML_ASSERT(ne11 <= 512);
struct ggml_tensor * src2 = gf->nodes[i]->src[2];
const int64_t ne20 = src2 ? src2->ne[0] : 0;
@@ -1732,9 +1742,6 @@ void ggml_metal_graph_compute(
GGML_ASSERT(!ggml_is_transposed(src2));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(ne20 % 32 == 0);
// !!!!!!!!! TODO: this assert is probably required but not sure!
//GGML_ASSERT(ne20 >= 64);
GGML_ASSERT(src1t == GGML_TYPE_F32);
const uint r2 = ne12/ne22;
@@ -1742,22 +1749,22 @@ void ggml_metal_graph_compute(
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel
int ne11_mm_min = 1;
int ne11_mm_min = n_as;
const int idx = ((int32_t *) dst->op_params)[0];
// batch size
GGML_ASSERT(ne01 == ne11);
const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
// !!!
// TODO: for now, always use mat-vec kernels until we figure out how to improve the
// indirect matrix multiplication
// !!!
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) {
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
ne20 % 32 == 0 && ne20 >= 64 &&
ne11 > ne11_mm_min) {
switch (src2->type) {
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break;
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break;
@@ -1787,7 +1794,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:14];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:16];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
@@ -1805,8 +1812,7 @@ void ggml_metal_graph_compute(
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
// TODO: processing one row at a time (ne11 -> 1) is not efficient
[encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} else {
int nth0 = 32;
int nth1 = 1;
@@ -1889,11 +1895,17 @@ void ggml_metal_graph_compute(
} break;
default:
{
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
GGML_ASSERT(false && "not implemented");
}
};
if (ggml_is_quantized(src2t)) {
GGML_ASSERT(ne20 >= nth0*nth1);
}
const int64_t _ne1 = 1; // kernels needs a reference in constant memory
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -1969,6 +1981,7 @@ void ggml_metal_graph_compute(
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
default: GGML_ASSERT(false && "not implemented");
}

ggml-metal.metal

@@ -846,7 +846,7 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
//Note: This is a template, but strictly speaking it only applies to
// quantizations where the block size is 32. It also does not
// giard against the number of rows not being divisible by
// guard against the number of rows not being divisible by
// N_DST, so this is another explicit assumption of the implementation.
template<typename block_q_type, int nr, int nsg, int nw>
void mul_vec_q_n_f32_impl(
@@ -3829,6 +3829,35 @@ kernel void kernel_get_rows_f16(
}
}
kernel void kernel_get_rows_i32(
device const void * src0,
device const char * src1,
device int32_t * dst,
constant int64_t & ne00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb1,
constant uint64_t & nb2,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiitg[[thread_index_in_threadgroup]],
uint3 tptg [[threads_per_threadgroup]]) {
const int64_t i10 = tgpig.x;
const int64_t i11 = tgpig.y;
const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
const int64_t i02 = i11;
for (int ind = tiitg; ind < ne00; ind += tptg.x) {
((device int32_t *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
((device int32_t *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
}
}
#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
#define BLOCK_SIZE_K 32
@@ -3973,6 +4002,131 @@ void kernel_mul_mm_impl(device const uchar * src0,
}
}
// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in src1ids
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
void kernel_mul_mm_id_impl(
device const uchar * src0,
device const uchar * src1,
thread short * src1ids,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne02,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
int64_t ne1,
constant uint & r2,
constant uint & r3,
threadgroup uchar * shared_memory,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiitg[[thread_index_in_threadgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
threadgroup half * sa = (threadgroup half *)(shared_memory);
threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
const uint r0 = tgpig.y;
const uint r1 = tgpig.x;
const uint im = tgpig.z;
if (r1 * BLOCK_SIZE_N >= ne1) return;
// if this block is of 64x32 shape or smaller
short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
// a thread shouldn't load data outside of the matrix
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
simdgroup_half8x8 ma[4];
simdgroup_float8x8 mb[2];
simdgroup_float8x8 c_res[8];
for (int i = 0; i < 8; i++){
c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
}
short il = (tiitg % THREAD_PER_ROW);
const uint i12 = im%ne12;
const uint i13 = im/ne12;
uint offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02);
ushort offset1 = il/nl;
device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
device const float * y = (device const float *)(src1
+ nb12 * im
+ nb11 * src1ids[r1 * BLOCK_SIZE_N + thread_col]
+ nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
// load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
threadgroup_barrier(mem_flags::mem_threadgroup);
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
+ (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
}
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
il = (il + 2 < nl) ? il + 2 : il % 2;
x = (il < 2) ? x + (2+nl-1)/nl : x;
y += BLOCK_SIZE_K;
threadgroup_barrier(mem_flags::mem_threadgroup);
// load matrices from threadgroup memory and conduct outer products
threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
for (int i = 0; i < 4; i++) {
simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i);
}
simdgroup_barrier(mem_flags::mem_none);
for (int i = 0; i < 2; i++) {
simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i);
}
lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
for (int i = 0; i < 8; i++){
simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
}
}
}
{
threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
device float * C = dst + (BLOCK_SIZE_M * r0) + im*ne1*ne0;
if (sgitg == 0) {
for (int i = 0; i < n_rows; i++) {
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
*(C + i + src1ids[j + r1*BLOCK_SIZE_N] * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
}
}
}
}
}
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
kernel void kernel_mul_mm(device const uchar * src0,
device const uchar * src1,
@@ -4019,7 +4173,7 @@ template<typename block_q, short nl, void (*dequantize_func)(device const block_
kernel void kernel_mul_mm_id(
device const uchar * ids,
device const uchar * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne02,
@@ -4048,18 +4202,28 @@ kernel void kernel_mul_mm_id(
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiitg[[thread_index_in_threadgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
device const uchar * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
device const uchar * src0s[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
const int64_t bid = tgpig.z/(ne12*ne13);
// expert id
const int32_t id = tgpig.z/(ne12*ne13);
tgpig.z = tgpig.z%(ne12*ne13);
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
// row indices of src1 for expert id
int64_t _ne1 = 0;
short src1ids[512];
kernel_mul_mm_impl<block_q, nl, dequantize_func>(
src0[id],
src1 + bid*nb11,
(device float *) (dst + bid*nb1),
for (int64_t i1 = 0; i1 < ne1; i1++) {
if (((device int32_t *) (ids + i1*nbi1))[idx] == id) {
src1ids[_ne1++] = i1;
}
}
kernel_mul_mm_id_impl<block_q, nl, dequantize_func>(
src0s[id],
src1,
src1ids,
dst,
ne00,
ne02,
nb01,
@@ -4069,7 +4233,7 @@ kernel void kernel_mul_mm_id(
nb11,
nb12,
ne0,
ne1,
_ne1,
r2,
r3,
shared_memory,
@@ -4158,7 +4322,7 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
typedef void (mat_mm_id_t)(
device const uchar * ids,
device const uchar * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne02,
@@ -4207,7 +4371,7 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu
kernel void kernel_mul_mv_id_f32_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4251,7 +4415,7 @@ kernel void kernel_mul_mv_id_f32_f32(
kernel_mul_mv_f32_f32_impl(
src0[id],
src1 + bid*nb11,
(device float *) (dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4276,7 +4440,7 @@ kernel void kernel_mul_mv_id_f32_f32(
kernel void kernel_mul_mv_id_f16_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4320,7 +4484,7 @@ kernel void kernel_mul_mv_id_f16_f32(
kernel_mul_mv_f16_f32_impl(
src0[id],
src1 + bid*nb11,
(device float *) (dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4345,7 +4509,7 @@ kernel void kernel_mul_mv_id_f16_f32(
kernel void kernel_mul_mv_id_q8_0_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4389,7 +4553,7 @@ kernel void kernel_mul_mv_id_q8_0_f32(
kernel_mul_mv_q8_0_f32_impl(
src0[id],
(device const float *) (src1 + bid*nb11),
(device float *) ( dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4408,7 +4572,7 @@ kernel void kernel_mul_mv_id_q8_0_f32(
kernel void kernel_mul_mv_id_q4_0_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4452,7 +4616,7 @@ kernel void kernel_mul_mv_id_q4_0_f32(
mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
src0[id],
(device const float *) (src1 + bid*nb11),
(device float *) ( dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4471,7 +4635,7 @@ kernel void kernel_mul_mv_id_q4_0_f32(
kernel void kernel_mul_mv_id_q4_1_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4515,7 +4679,7 @@ kernel void kernel_mul_mv_id_q4_1_f32(
mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
src0[id],
(device const float *) (src1 + bid*nb11),
(device float *) ( dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4534,7 +4698,7 @@ kernel void kernel_mul_mv_id_q4_1_f32(
kernel void kernel_mul_mv_id_q5_0_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4578,7 +4742,7 @@ kernel void kernel_mul_mv_id_q5_0_f32(
mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
src0[id],
(device const float *) (src1 + bid*nb11),
(device float *) ( dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4597,7 +4761,7 @@ kernel void kernel_mul_mv_id_q5_0_f32(
kernel void kernel_mul_mv_id_q5_1_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4641,7 +4805,7 @@ kernel void kernel_mul_mv_id_q5_1_f32(
mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
src0[id],
(device const float *) (src1 + bid*nb11),
(device float *) ( dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4660,7 +4824,7 @@ kernel void kernel_mul_mv_id_q5_1_f32(
kernel void kernel_mul_mv_id_q2_K_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4704,7 +4868,7 @@ kernel void kernel_mul_mv_id_q2_K_f32(
kernel_mul_mv_q2_K_f32_impl(
src0[id],
(device const float *) (src1 + bid*nb11),
(device float *) ( dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4723,7 +4887,7 @@ kernel void kernel_mul_mv_id_q2_K_f32(
kernel void kernel_mul_mv_id_q3_K_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4767,7 +4931,7 @@ kernel void kernel_mul_mv_id_q3_K_f32(
kernel_mul_mv_q3_K_f32_impl(
src0[id],
(device const float *) (src1 + bid*nb11),
(device float *) ( dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4786,7 +4950,7 @@ kernel void kernel_mul_mv_id_q3_K_f32(
kernel void kernel_mul_mv_id_q4_K_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4830,7 +4994,7 @@ kernel void kernel_mul_mv_id_q4_K_f32(
kernel_mul_mv_q4_K_f32_impl(
src0[id],
(device const float *) (src1 + bid*nb11),
(device float *) ( dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4849,7 +5013,7 @@ kernel void kernel_mul_mv_id_q4_K_f32(
kernel void kernel_mul_mv_id_q5_K_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4893,7 +5057,7 @@ kernel void kernel_mul_mv_id_q5_K_f32(
kernel_mul_mv_q5_K_f32_impl(
src0[id],
(device const float *) (src1 + bid*nb11),
(device float *) ( dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,
@@ -4912,7 +5076,7 @@ kernel void kernel_mul_mv_id_q5_K_f32(
kernel void kernel_mul_mv_id_q6_K_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
@@ -4956,7 +5120,7 @@ kernel void kernel_mul_mv_id_q6_K_f32(
kernel_mul_mv_q6_K_f32_impl(
src0[id],
(device const float *) (src1 + bid*nb11),
(device float *) ( dst + bid*nb1),
dst + bid*ne0,
ne00,
ne01,
ne02,

ggml.c (207 changed lines)

@@ -4766,8 +4766,11 @@ struct ggml_tensor * ggml_get_rows(
}
// TODO: implement non F32 return
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
enum ggml_type type = GGML_TYPE_F32;
if (a->type == GGML_TYPE_I32) {
type = a->type;
}
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
result->op = GGML_OP_GET_ROWS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6938,14 +6941,165 @@ static void ggml_compute_forward_dup_f32(
}
}
// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
static void ggml_compute_forward_dup_bytes(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(src0->type == dst->type);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
ggml_compute_forward_dup_same_cont(params, src0, dst);
return;
}
GGML_TENSOR_UNARY_OP_LOCALS;
const size_t type_size = ggml_type_size(src0->type);
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
// parallelize by rows
const int nr = ne01;
// number of rows per thread
const int dr = (nr + nth - 1) / nth;
// row range for this thread
const int ir0 = dr * ith;
const int ir1 = MIN(ir0 + dr, nr);
if (src0->type == dst->type &&
ne00 == ne0 &&
nb00 == type_size && nb0 == type_size) {
// copy by rows
const size_t rs = ne00 * type_size;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ir0; i01 < ir1; i01++) {
memcpy(
((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
rs);
}
}
}
return;
}
if (ggml_is_contiguous(dst)) {
size_t id = 0;
char * dst_ptr = (char *) dst->data;
const size_t rs = ne00 * type_size;
if (nb00 == type_size) {
// src0 is contigous on first dimension, copy by rows
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int64_t i01 = ir0; i01 < ir1; i01++) {
const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
memcpy(dst_ptr + id, src0_ptr, rs);
id += rs;
}
id += rs * (ne01 - ir1);
}
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
memcpy(dst_ptr + id, src0_ptr, type_size);
id += type_size;
}
}
id += rs * (ne01 - ir1);
}
}
}
return;
}
// dst counters
int64_t i10 = 0;
int64_t i11 = 0;
int64_t i12 = 0;
int64_t i13 = 0;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
i10 += ne00 * ir0;
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
memcpy(dst_ptr, src0_ptr, type_size);
if (++i10 == ne0) {
i10 = 0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
i10 += ne00 * (ne01 - ir1);
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
}
static void ggml_compute_forward_dup(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, src0, dst);
if (src0->type == dst->type) {
ggml_compute_forward_dup_bytes(params, src0, dst);
return;
}
switch (src0->type) {
case GGML_TYPE_F16:
{
@@ -8404,10 +8558,12 @@ static void ggml_compute_forward_repeat(
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
case GGML_TYPE_I16:
{
ggml_compute_forward_repeat_f16(params, src0, dst);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_repeat_f32(params, src0, dst);
} break;
@@ -8550,6 +8706,7 @@ static void ggml_compute_forward_concat(
struct ggml_tensor* dst) {
switch (src0->type) {
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_concat_f32(params, src0, src1, dst);
} break;
@@ -9547,10 +9704,10 @@ static void ggml_compute_forward_group_norm(
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
//const int64_t ne00 = src0->ne[0];
//const int64_t ne01 = src0->ne[1];
@@ -9630,7 +9787,7 @@ static void ggml_compute_forward_mul_mat(
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
if (params->ith != 0) {
return;
}
@@ -10674,6 +10831,7 @@ static void ggml_compute_forward_get_rows(
ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
} break;
@@ -16143,24 +16301,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
#if defined(GGML_USE_CUBLAS)
if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
}
#elif defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
}
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
}
#endif
} break;
case GGML_OP_MUL_MAT_ID:
{
@@ -16333,6 +16473,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
state->shared->node_n += 1;
return (thread_ret_t) GGML_EXIT_ABORTED;
}
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
// all other threads are finished and spinning
// do finalize and init here so we don't have synchronize again
@@ -16398,14 +16539,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
} else {
// wait for other threads to finish
const int last = node_n;
const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
while (true) {
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
// depending on the workload and the operating system.
// since it is not clear what is the best approach, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
sched_yield();
#endif
// UPD: adding the do_yield flag seems to resolve the issue universally
if (do_yield) {
sched_yield();
}
node_n = atomic_load(&state->shared->node_n);
if (node_n != last) break;
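As a standalone illustration of the pattern this hunk introduces (a hypothetical helper, not the patch itself): waiting threads poll the shared node counter and call sched_yield() only when the node they wait on is expected to run long, e.g. a mul-mat handed off to BLAS; otherwise they busy-spin for lower wake-up latency.

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

// Hypothetical sketch of the do_yield idea: yield the CPU only while a
// long-running op (e.g. BLAS mul-mat) holds the cores, otherwise busy-wait.
static void wait_for_node(atomic_int * node_n, int last, bool do_yield) {
    while (atomic_load(node_n) == last) {
        if (do_yield) {
            sched_yield(); // give the BLAS worker threads the cores
        }
        // else: spin; cheaper when the current node finishes quickly
    }
}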
@@ -16484,7 +16629,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
} else
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
if (ggml_compute_forward_mul_mat_use_blas(node)) {
if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory just for single 2D matrix from src0
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);

scripts/sync-ggml-am.sh

@@ -27,7 +27,7 @@ echo "Syncing ggml changes since commit $lc"
cd $SRC_GGML
git log --oneline $lc..HEAD
git log --oneline $lc..HEAD | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits
git log --oneline $lc..HEAD --reverse | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits
if [ ! -s $SRC_LLAMA/ggml-commits ]; then
rm -v $SRC_LLAMA/ggml-commits
@@ -87,7 +87,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-impl.h -> ggml-impl.h
# src/ggml-metal.h -> ggml-metal.h
# src/ggml-metal.m -> ggml-metal.m
# src/ggml-metal.metal -> ggml-metal.metal
# src/ggml-mpi.h -> ggml-mpi.h
# src/ggml-mpi.c -> ggml-mpi.c
# src/ggml-opencl.cpp -> ggml-opencl.cpp
@@ -114,7 +113,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \
-e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
-e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
-e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \

scripts/sync-ggml.last

@@ -1 +1 @@
df098ea908764cba4a4889a1cbe7b026b2d31a14
3fd01e00e40583ccd4b393a7c6502d6a4455a1d5

tests/test-backend-ops.cpp

@@ -58,6 +58,9 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
int64_t hist[16];
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
// This is going to create some weird integers though.
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
} else {
GGML_ASSERT(false);
}
@@ -87,8 +90,13 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
tv.push_back(*(float *) &buf[i]);
} else if (t->type == GGML_TYPE_I32) {
tv.push_back((float)*(int32_t *) &buf[i]);
} else if (t->type == GGML_TYPE_I16) {
tv.push_back((float)*(int16_t *) &buf[i]);
} else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) {
tt.to_float(&buf[i], vq.data(), bs);
std::vector<float> vq(ggml_blck_size(t->type));
tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ASSERT(false);
@@ -384,15 +392,21 @@ struct test_case {
struct callback_userdata {
bool ok;
double max_err;
ggml_backend_t backend1;
ggml_backend_t backend2;
};
callback_userdata ud {
true,
max_nmse_err(),
backend1,
backend2
};
auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
callback_userdata * ud = (callback_userdata *) user_data;
const char * bn1 = ggml_backend_name(ud->backend1);
const char * bn2 = ggml_backend_name(ud->backend2);
if (t1->op == GGML_OP_NONE) {
// sentinels must be unchanged
@@ -414,7 +428,7 @@ struct test_case {
for (size_t i = 0; i < f1.size(); i++) {
// check for nans
if (std::isnan(f1[i]) || std::isnan(f2[i])) {
printf("[%s] NaN at index %zu (%f %f) ", ggml_op_desc(t1), i, f1[i], f2[i]);
printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
ud->ok = false;
return true;
}
@@ -422,12 +436,12 @@ struct test_case {
if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
if (std::signbit(f1[i]) != std::signbit(f2[i])) {
printf("[%s] inf sign mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]);
printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
ud->ok = false;
return true;
}
} else {
printf("[%s] inf mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]);
printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
ud->ok = false;
return true;
}
@@ -661,17 +675,26 @@ struct test_repeat : public test_case {
struct test_dup : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
const std::array<int64_t, 4> permute;
bool _use_permute;
std::string vars() override {
return VARS_TO_STR2(type, ne);
std::string v = VARS_TO_STR2(type, ne);
if (_use_permute) v += "," + VAR_TO_STR(permute);
return v;
}
test_dup(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 1})
: type(type), ne(ne) {}
std::array<int64_t, 4> ne = {10, 10, 10, 1},
std::array<int64_t, 4> permute = {0, 0, 0, 0})
: type(type), ne(ne), permute(permute),
_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
if (_use_permute) {
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
}
ggml_tensor * out = ggml_dup(ctx, src);
return out;
}
@@ -1450,14 +1473,26 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
}
}
for (int b : {1, 7}) {
for (bool v : {false, true}) {
test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
}
}
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
test_cases.emplace_back(new test_dup());
test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
for (ggml_type type : all_types) {
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
@@ -1565,7 +1600,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_alibi());
test_cases.emplace_back(new test_im2col());
test_cases.emplace_back(new test_concat());
test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));