From fd89556567057bf64a6f6d6e50abec488929d7e0 Mon Sep 17 00:00:00 2001 From: Devedse <2350015+devedse@users.noreply.github.com> Date: Sat, 9 May 2026 07:50:24 +0200 Subject: [PATCH] [SYCL] Add BF16 support to GET_ROWS operation (#21391) Add GGML_TYPE_BF16 to the SYCL backend's GET_ROWS operation, both in supports_op and in the kernel dispatch. This fixes a performance regression where models using BF16 embedding tensors (e.g., Gemma4's per_layer_token_embd.weight) fall back to CPU for the GET_ROWS op, causing a full GPU-to-CPU tensor transfer every token. The fix reuses the existing get_rows_sycl_float template with sycl::ext::oneapi::bfloat16, matching the pattern already used for sycl::half (F16) and float (F32). --- ggml/src/ggml-sycl/getrows.cpp | 4 ++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 1 + 2 files changed, 5 insertions(+) diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp index 03f8dd9074..ca45745477 100644 --- a/ggml/src/ggml-sycl/getrows.cpp +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -183,6 +183,10 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::half *)dst->src[0]->data, src1_i32, (float *)dst->data, ctx.stream()); break; + case GGML_TYPE_BF16: + get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::ext::oneapi::bfloat16 *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; case GGML_TYPE_F32: get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, src1_i32, (float *)dst->data, ctx.stream()); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f86ff3e946..b6e705cdf3 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4974,6 +4974,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g { switch (op->src[0]->type) { case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_F32: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: