HIP: adjust RDNA3.5 MMQ kernel selection logic (#18666 )

cmake : update blas logic (#18205 )
server : adjust unified KV cache tests (#18716 )
2026-05-02 15:14:06 +00:00 · 2026-01-10 17:19:01 +01:00 · 2026-01-10 18:00:54 +02:00 · 2026-01-10 17:51:56 +02:00 · 2026-01-10 16:04:05 +01:00
5 changed files with 36 additions and 23 deletions
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -32,14 +32,12 @@ if (BLAS_FOUND)
                pkg_check_modules(DepBLAS openblas)
            endif()
        elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
-            add_compile_definitions(GGML_BLAS_USE_BLIS)
            pkg_check_modules(DepBLAS blis)
        elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
            pkg_check_modules(DepBLAS blas-atlas)
        elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
            pkg_check_modules(DepBLAS flexiblas_api)
        elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
-            add_compile_definitions(GGML_BLAS_USE_MKL)
            # all Intel* libraries share the same include path
            pkg_check_modules(DepBLAS mkl-sdl)
        elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
@@ -74,10 +72,26 @@ if (BLAS_FOUND)

    target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})

-    if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+    if ("${GGML_BLAS_VENDOR}" STREQUAL "")
+        message(WARNING "GGML_BLAS_VENDOR is not set; some methods may not link properly.")
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "Intel" OR ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND "${GGML_BLAS_VENDOR}" MATCHES "Generic"))
        add_compile_definitions(GGML_BLAS_USE_MKL)
    endif()

+    if ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
+        add_compile_definitions(GGML_BLAS_USE_OPENBLAS)
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "FLAME" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL_mt")
+        add_compile_definitions(GGML_BLAS_USE_BLIS)
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "NVPL")
+        add_compile_definitions(GGML_BLAS_USE_NVPL)
+    endif()
+
    target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
    target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
 else()
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -115,15 +115,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 #endif
    }

-#if defined(OPENBLAS_VERSION)
+#if defined(GGML_BLAS_USE_OPENBLAS)
    openblas_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(GGML_BLAS_USE_BLIS)
+#elif defined(GGML_BLAS_USE_BLIS)
    bli_thread_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(GGML_BLAS_USE_NVPL)
+#elif defined(GGML_BLAS_USE_NVPL)
    nvpl_blas_set_num_threads(ctx->n_threads);
 #endif

@@ -288,7 +284,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
        /* .context = */ ctx,
    };

-#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP)
    if (openblas_get_parallel() != OPENBLAS_OPENMP) {
        GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
    }
@@ -329,7 +325,7 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t
        return "BLIS";
    #elif defined(GGML_BLAS_USE_NVPL)
        return "NVPL";
-    #elif defined(OPENBLAS_VERSION)
+    #elif defined(GGML_BLAS_USE_OPENBLAS)
        return "OpenBLAS";
    #else
        return "BLAS";
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -333,28 +333,31 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
    }

    if (amd_wmma_available(cc)) {
-        // RDNA 4 is consistently worse on rocblas
-        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
-            // High expert counts almost always better on MMQ
-            // due to a large amount of graph splits
+            // High expert counts are almost always better on MMQ due to
+            //     the synchronization overhead in the cuBLAS/hipBLAS path:
            // https://github.com/ggml-org/llama.cpp/pull/18202
            if (n_experts >= 64) {
                return true;
            }

+            // For some quantization types MMQ can have lower peak TOPS than hipBLAS
+            //     so it's only faster for sufficiently small batch sizes:
            switch (type) {
-                // These quants are really bad on MMQ
                case GGML_TYPE_Q2_K:
+                    return ne11 <= 128;
                case GGML_TYPE_Q6_K:
-                // These quants are usually worse but not always
+                    return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
                case GGML_TYPE_IQ2_XS:
                case GGML_TYPE_IQ2_S:
-                    return ne11 <= 128;
+                    return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
                default:
                    return true;
            }
        }
+
+        // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
        return true;
    }

--- a/scripts/pr2wt.sh
+++ b/scripts/pr2wt.sh
@@ -40,7 +40,7 @@ org_repo=${org_repo%.git}

 echo "org/repo: $org_repo"

-meta=$(curl -sSf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR")
+meta=$(curl -sSLf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR")

 url_remote=$(echo "$meta" | jq -r '.head.repo.clone_url')
 head_ref=$(echo "$meta" | jq -r '.head.ref')
--- a/tools/server/tests/unit/test_completion.py
+++ b/tools/server/tests/unit/test_completion.py
@@ -393,12 +393,12 @@ def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
    for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
        if expect_ok:
            assert res.status_code == 200
+
+        # note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
+        if res.status_code == 200:
            assert "content" in res.body
            if "timings" in res.body:
                assert res.body["timings"]["predicted_n"] == n_predict
-        else:
-            assert res.status_code == 500
-            assert "content" not in res.body


@pytest.mark.parametrize(
Author	SHA1	Message	Date
Johannes Gäßler	d2ff4e23ac	HIP: adjust RDNA3.5 MMQ kernel selection logic (#18666 )	2026-01-10 17:19:01 +01:00
Perry Naseck	657a2e644b	cmake : update blas logic (#18205 )	2026-01-10 18:00:54 +02:00
Georgi Gerganov	f307926482	server : adjust unified KV cache tests (#18716 )	2026-01-10 17:51:56 +02:00
Sigbjørn Skjæret	7fdc8c893d	scripts : follow api redirects in pr2wt.sh (#18739 )	2026-01-10 16:04:05 +01:00