Compare commits

...

4 Commits
b1215 ... b1219

Author SHA1 Message Date
Johannes Gäßler
0a5eebb45d CUDA: mul_mat_q RDNA2 tunings (#2910)
* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
2023-09-13 11:20:24 +02:00
FK
84e723653c speculative: add --n-gpu-layers-draft option (#3063) 2023-09-13 08:50:46 +02:00
Eric Sommerlade
b52b29ab9d arm64 support for windows (#3007)
Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
2023-09-12 21:54:20 -04:00
Johannes Gäßler
4f7cd6ba9c CUDA: fix LoRAs (#3130) 2023-09-13 00:15:33 +02:00
9 changed files with 476 additions and 62 deletions

View File

@@ -388,7 +388,6 @@ if (LLAMA_HIPBLAS)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
@@ -461,6 +460,13 @@ endif()
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (MSVC)
string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
else ()
set(CMAKE_GENERATOR_PLATFORM_LWR "")
endif ()
if (NOT MSVC)
if (LLAMA_STATIC)
add_link_options(-static)
@@ -476,10 +482,14 @@ if (NOT MSVC)
endif()
endif()
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64"))
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
message(STATUS "ARM detected")
if (MSVC)
# TODO: arm msvc?
add_compile_definitions(__ARM_NEON)
add_compile_definitions(__ARM_FEATURE_FMA)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
# add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
else()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
@@ -494,7 +504,7 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
add_compile_options(-mfp16-format=ieee -mno-unaligned-access)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX512)

View File

@@ -408,7 +408,6 @@ ifdef LLAMA_HIPBLAS
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
HIPFLAGS += -DCC_TURING=1000000000
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV

View File

@@ -374,6 +374,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers_draft = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
@@ -664,6 +675,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
printf(" -ngld N, --n-gpu-layers-draft N\n");
printf(" number of layers to store in VRAM for the draft model\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");

View File

@@ -38,6 +38,7 @@ struct gpt_params {
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.

View File

@@ -42,6 +42,7 @@ int main(int argc, char ** argv) {
// load the draft model
params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
// tokenize the prompt

View File

@@ -13,7 +13,7 @@
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif
#endif // __HIP_PLATFORM_AMD__
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -68,19 +68,29 @@
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif
#endif // defined(GGML_USE_HIPBLAS)
#include "ggml-cuda.h"
#include "ggml.h"
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#ifndef CC_TURING
#define CC_TURING 700
#endif
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_TURING 700
#define CC_OFFSET_AMD 1000000
#define CC_RDNA2 CC_OFFSET_AMD + 1030
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
@@ -132,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#endif
return c;
}
#endif
#endif // defined(GGML_USE_HIPBLAS)
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -3472,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
}
}
#define MMQ_X_Q4_0_RDNA2 64
#define MMQ_Y_Q4_0_RDNA2 128
#define NWARPS_Q4_0_RDNA2 8
#define MMQ_X_Q4_0_RDNA1 64
#define MMQ_Y_Q4_0_RDNA1 64
#define NWARPS_Q4_0_RDNA1 8
#define MMQ_X_Q4_0_AMPERE 64
#define MMQ_Y_Q4_0_AMPERE 128
#define NWARPS_Q4_0_AMPERE 4
@@ -3479,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
#define MMQ_Y_Q4_0_PASCAL 64
#define NWARPS_Q4_0_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q4_0(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q4_0(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q4_0_RDNA2;
const int mmq_y = MMQ_Y_Q4_0_RDNA2;
const int nwarps = NWARPS_Q4_0_RDNA2;
#else
const int mmq_x = MMQ_X_Q4_0_RDNA1;
const int mmq_y = MMQ_Y_Q4_0_RDNA1;
const int nwarps = NWARPS_Q4_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q4_0_AMPERE;
const int mmq_y = MMQ_Y_Q4_0_AMPERE;
const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3506,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q4_1_RDNA2 64
#define MMQ_Y_Q4_1_RDNA2 128
#define NWARPS_Q4_1_RDNA2 8
#define MMQ_X_Q4_1_RDNA1 64
#define MMQ_Y_Q4_1_RDNA1 64
#define NWARPS_Q4_1_RDNA1 8
#define MMQ_X_Q4_1_AMPERE 64
#define MMQ_Y_Q4_1_AMPERE 128
#define NWARPS_Q4_1_AMPERE 4
@@ -3514,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
#define NWARPS_Q4_1_PASCAL 8
template <bool need_check> static __global__ void
#if __CUDA_ARCH__ < CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_TURING
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q4_1(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q4_1_RDNA2;
const int mmq_y = MMQ_Y_Q4_1_RDNA2;
const int nwarps = NWARPS_Q4_1_RDNA2;
#else
const int mmq_x = MMQ_X_Q4_1_RDNA1;
const int mmq_y = MMQ_Y_Q4_1_RDNA1;
const int nwarps = NWARPS_Q4_1_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q4_1_AMPERE;
const int mmq_y = MMQ_Y_Q4_1_AMPERE;
const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3544,6 +3606,12 @@ template <bool need_check> static __global__ void
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q5_0_RDNA2 64
#define MMQ_Y_Q5_0_RDNA2 128
#define NWARPS_Q5_0_RDNA2 8
#define MMQ_X_Q5_0_RDNA1 64
#define MMQ_Y_Q5_0_RDNA1 64
#define NWARPS_Q5_0_RDNA1 8
#define MMQ_X_Q5_0_AMPERE 128
#define MMQ_Y_Q5_0_AMPERE 64
#define NWARPS_Q5_0_AMPERE 4
@@ -3551,11 +3619,32 @@ template <bool need_check> static __global__ void
#define MMQ_Y_Q5_0_PASCAL 64
#define NWARPS_Q5_0_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q5_0(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_0(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q5_0_RDNA2;
const int mmq_y = MMQ_Y_Q5_0_RDNA2;
const int nwarps = NWARPS_Q5_0_RDNA2;
#else
const int mmq_x = MMQ_X_Q5_0_RDNA1;
const int mmq_y = MMQ_Y_Q5_0_RDNA1;
const int nwarps = NWARPS_Q5_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q5_0_AMPERE;
const int mmq_y = MMQ_Y_Q5_0_AMPERE;
const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3578,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q5_1_RDNA2 64
#define MMQ_Y_Q5_1_RDNA2 128
#define NWARPS_Q5_1_RDNA2 8
#define MMQ_X_Q5_1_RDNA1 64
#define MMQ_Y_Q5_1_RDNA1 64
#define NWARPS_Q5_1_RDNA1 8
#define MMQ_X_Q5_1_AMPERE 128
#define MMQ_Y_Q5_1_AMPERE 64
#define NWARPS_Q5_1_AMPERE 4
@@ -3585,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
#define MMQ_Y_Q5_1_PASCAL 64
#define NWARPS_Q5_1_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q5_1(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_1(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q5_1_RDNA2;
const int mmq_y = MMQ_Y_Q5_1_RDNA2;
const int nwarps = NWARPS_Q5_1_RDNA2;
#else
const int mmq_x = MMQ_X_Q5_1_RDNA1;
const int mmq_y = MMQ_Y_Q5_1_RDNA1;
const int nwarps = NWARPS_Q5_1_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q5_1_AMPERE;
const int mmq_y = MMQ_Y_Q5_1_AMPERE;
const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3612,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q8_0_RDNA2 64
#define MMQ_Y_Q8_0_RDNA2 128
#define NWARPS_Q8_0_RDNA2 8
#define MMQ_X_Q8_0_RDNA1 64
#define MMQ_Y_Q8_0_RDNA1 64
#define NWARPS_Q8_0_RDNA1 8
#define MMQ_X_Q8_0_AMPERE 128
#define MMQ_Y_Q8_0_AMPERE 64
#define NWARPS_Q8_0_AMPERE 4
@@ -3619,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
#define MMQ_Y_Q8_0_PASCAL 64
#define NWARPS_Q8_0_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q8_0(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q8_0(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q8_0_RDNA2;
const int mmq_y = MMQ_Y_Q8_0_RDNA2;
const int nwarps = NWARPS_Q8_0_RDNA2;
#else
const int mmq_x = MMQ_X_Q8_0_RDNA1;
const int mmq_y = MMQ_Y_Q8_0_RDNA1;
const int nwarps = NWARPS_Q8_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q8_0_AMPERE;
const int mmq_y = MMQ_Y_Q8_0_AMPERE;
const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3646,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q2_K_RDNA2 64
#define MMQ_Y_Q2_K_RDNA2 128
#define NWARPS_Q2_K_RDNA2 8
#define MMQ_X_Q2_K_RDNA1 128
#define MMQ_Y_Q2_K_RDNA1 32
#define NWARPS_Q2_K_RDNA1 8
#define MMQ_X_Q2_K_AMPERE 64
#define MMQ_Y_Q2_K_AMPERE 128
#define NWARPS_Q2_K_AMPERE 4
@@ -3653,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
#define MMQ_Y_Q2_K_PASCAL 64
#define NWARPS_Q2_K_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q2_K(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q2_K(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q2_K_RDNA2;
const int mmq_y = MMQ_Y_Q2_K_RDNA2;
const int nwarps = NWARPS_Q2_K_RDNA2;
#else
const int mmq_x = MMQ_X_Q2_K_RDNA1;
const int mmq_y = MMQ_Y_Q2_K_RDNA1;
const int nwarps = NWARPS_Q2_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q2_K_AMPERE;
const int mmq_y = MMQ_Y_Q2_K_AMPERE;
const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3680,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q3_K_RDNA2 128
#define MMQ_Y_Q3_K_RDNA2 64
#define NWARPS_Q3_K_RDNA2 8
#define MMQ_X_Q3_K_RDNA1 32
#define MMQ_Y_Q3_K_RDNA1 128
#define NWARPS_Q3_K_RDNA1 8
#define MMQ_X_Q3_K_AMPERE 128
#define MMQ_Y_Q3_K_AMPERE 128
#define NWARPS_Q3_K_AMPERE 4
@@ -3688,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
#define NWARPS_Q3_K_PASCAL 8
template <bool need_check> static __global__ void
#if __CUDA_ARCH__ < CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_TURING
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q3_K(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q3_K_RDNA2;
const int mmq_y = MMQ_Y_Q3_K_RDNA2;
const int nwarps = NWARPS_Q3_K_RDNA2;
#else
const int mmq_x = MMQ_X_Q3_K_RDNA1;
const int mmq_y = MMQ_Y_Q3_K_RDNA1;
const int nwarps = NWARPS_Q3_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q3_K_AMPERE;
const int mmq_y = MMQ_Y_Q3_K_AMPERE;
const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3718,6 +3913,12 @@ template <bool need_check> static __global__ void
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q4_K_RDNA2 64
#define MMQ_Y_Q4_K_RDNA2 128
#define NWARPS_Q4_K_RDNA2 8
#define MMQ_X_Q4_K_RDNA1 32
#define MMQ_Y_Q4_K_RDNA1 64
#define NWARPS_Q4_K_RDNA1 8
#define MMQ_X_Q4_K_AMPERE 64
#define MMQ_Y_Q4_K_AMPERE 128
#define NWARPS_Q4_K_AMPERE 4
@@ -3726,14 +3927,33 @@ template <bool need_check> static __global__ void
#define NWARPS_Q4_K_PASCAL 8
template <bool need_check> static __global__ void
#if __CUDA_ARCH__ < CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_TURING
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q4_K(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q4_K_RDNA2;
const int mmq_y = MMQ_Y_Q4_K_RDNA2;
const int nwarps = NWARPS_Q4_K_RDNA2;
#else
const int mmq_x = MMQ_X_Q4_K_RDNA1;
const int mmq_y = MMQ_Y_Q4_K_RDNA1;
const int nwarps = NWARPS_Q4_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q4_K_AMPERE;
const int mmq_y = MMQ_Y_Q4_K_AMPERE;
const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3756,6 +3976,12 @@ template <bool need_check> static __global__ void
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q5_K_RDNA2 64
#define MMQ_Y_Q5_K_RDNA2 128
#define NWARPS_Q5_K_RDNA2 8
#define MMQ_X_Q5_K_RDNA1 32
#define MMQ_Y_Q5_K_RDNA1 64
#define NWARPS_Q5_K_RDNA1 8
#define MMQ_X_Q5_K_AMPERE 64
#define MMQ_Y_Q5_K_AMPERE 128
#define NWARPS_Q5_K_AMPERE 4
@@ -3763,11 +3989,32 @@ template <bool need_check> static __global__ void
#define MMQ_Y_Q5_K_PASCAL 64
#define NWARPS_Q5_K_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q5_K(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_K(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q5_K_RDNA2;
const int mmq_y = MMQ_Y_Q5_K_RDNA2;
const int nwarps = NWARPS_Q5_K_RDNA2;
#else
const int mmq_x = MMQ_X_Q5_K_RDNA1;
const int mmq_y = MMQ_Y_Q5_K_RDNA1;
const int nwarps = NWARPS_Q5_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q5_K_AMPERE;
const int mmq_y = MMQ_Y_Q5_K_AMPERE;
const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3790,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q6_K_RDNA2 64
#define MMQ_Y_Q6_K_RDNA2 128
#define NWARPS_Q6_K_RDNA2 8
#define MMQ_X_Q6_K_RDNA1 32
#define MMQ_Y_Q6_K_RDNA1 64
#define NWARPS_Q6_K_RDNA1 8
#define MMQ_X_Q6_K_AMPERE 64
#define MMQ_Y_Q6_K_AMPERE 64
#define NWARPS_Q6_K_AMPERE 4
@@ -3798,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
#define NWARPS_Q6_K_PASCAL 8
template <bool need_check> static __global__ void
#if __CUDA_ARCH__ < CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_TURING
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q6_K(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q6_K_RDNA2;
const int mmq_y = MMQ_Y_Q6_K_RDNA2;
const int nwarps = NWARPS_Q6_K_RDNA2;
#else
const int mmq_x = MMQ_X_Q6_K_RDNA1;
const int mmq_y = MMQ_Y_Q6_K_RDNA1;
const int nwarps = NWARPS_Q6_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q6_K_AMPERE;
const int mmq_y = MMQ_Y_Q6_K_AMPERE;
const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4588,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q4_0_RDNA2;
mmq_y = MMQ_Y_Q4_0_RDNA2;
nwarps = NWARPS_Q4_0_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q4_0_RDNA1;
mmq_y = MMQ_Y_Q4_0_RDNA1;
nwarps = NWARPS_Q4_0_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q4_0_AMPERE;
mmq_y = MMQ_Y_Q4_0_AMPERE;
nwarps = NWARPS_Q4_0_AMPERE;
@@ -4625,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q4_1_RDNA2;
mmq_y = MMQ_Y_Q4_1_RDNA2;
nwarps = NWARPS_Q4_1_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q4_1_RDNA1;
mmq_y = MMQ_Y_Q4_1_RDNA1;
nwarps = NWARPS_Q4_1_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q4_1_AMPERE;
mmq_y = MMQ_Y_Q4_1_AMPERE;
nwarps = NWARPS_Q4_1_AMPERE;
@@ -4662,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q5_0_RDNA2;
mmq_y = MMQ_Y_Q5_0_RDNA2;
nwarps = NWARPS_Q5_0_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q5_0_RDNA1;
mmq_y = MMQ_Y_Q5_0_RDNA1;
nwarps = NWARPS_Q5_0_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q5_0_AMPERE;
mmq_y = MMQ_Y_Q5_0_AMPERE;
nwarps = NWARPS_Q5_0_AMPERE;
@@ -4699,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q5_1_RDNA2;
mmq_y = MMQ_Y_Q5_1_RDNA2;
nwarps = NWARPS_Q5_1_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q5_1_RDNA1;
mmq_y = MMQ_Y_Q5_1_RDNA1;
nwarps = NWARPS_Q5_1_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q5_1_AMPERE;
mmq_y = MMQ_Y_Q5_1_AMPERE;
nwarps = NWARPS_Q5_1_AMPERE;
@@ -4736,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q8_0_RDNA2;
mmq_y = MMQ_Y_Q8_0_RDNA2;
nwarps = NWARPS_Q8_0_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q8_0_RDNA1;
mmq_y = MMQ_Y_Q8_0_RDNA1;
nwarps = NWARPS_Q8_0_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q8_0_AMPERE;
mmq_y = MMQ_Y_Q8_0_AMPERE;
nwarps = NWARPS_Q8_0_AMPERE;
@@ -4773,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q2_K_RDNA2;
mmq_y = MMQ_Y_Q2_K_RDNA2;
nwarps = NWARPS_Q2_K_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q2_K_RDNA1;
mmq_y = MMQ_Y_Q2_K_RDNA1;
nwarps = NWARPS_Q2_K_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q2_K_AMPERE;
mmq_y = MMQ_Y_Q2_K_AMPERE;
nwarps = NWARPS_Q2_K_AMPERE;
@@ -4812,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q3_K_RDNA2;
mmq_y = MMQ_Y_Q3_K_RDNA2;
nwarps = NWARPS_Q3_K_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q3_K_RDNA1;
mmq_y = MMQ_Y_Q3_K_RDNA1;
nwarps = NWARPS_Q3_K_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q3_K_AMPERE;
mmq_y = MMQ_Y_Q3_K_AMPERE;
nwarps = NWARPS_Q3_K_AMPERE;
@@ -4850,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q4_K_RDNA2;
mmq_y = MMQ_Y_Q4_K_RDNA2;
nwarps = NWARPS_Q4_K_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q4_K_RDNA1;
mmq_y = MMQ_Y_Q4_K_RDNA1;
nwarps = NWARPS_Q4_K_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q4_K_AMPERE;
mmq_y = MMQ_Y_Q4_K_AMPERE;
nwarps = NWARPS_Q4_K_AMPERE;
@@ -4887,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q5_K_RDNA2;
mmq_y = MMQ_Y_Q5_K_RDNA2;
nwarps = NWARPS_Q5_K_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q5_K_RDNA1;
mmq_y = MMQ_Y_Q5_K_RDNA1;
nwarps = NWARPS_Q5_K_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q5_K_AMPERE;
mmq_y = MMQ_Y_Q5_K_AMPERE;
nwarps = NWARPS_Q5_K_AMPERE;
@@ -4924,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q6_K_RDNA2;
mmq_y = MMQ_Y_Q6_K_RDNA2;
nwarps = NWARPS_Q6_K_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q6_K_RDNA1;
mmq_y = MMQ_Y_Q6_K_RDNA1;
nwarps = NWARPS_Q6_K_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q6_K_AMPERE;
mmq_y = MMQ_Y_Q6_K_AMPERE;
nwarps = NWARPS_Q6_K_AMPERE;
@@ -5165,8 +5517,11 @@ void ggml_init_cublas() {
g_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
#else
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
for (int64_t id = 0; id < g_device_count; ++id) {
g_tensor_split[id] /= total_vram;
@@ -5247,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
if (src->backend == GGML_BACKEND_CPU) {
kind = cudaMemcpyHostToDevice;
src_ptr = (char *) src->data;
} else if (src->backend == GGML_BACKEND_GPU) {
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
kind = cudaMemcpyDeviceToDevice;
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
int id;
@@ -5289,9 +5645,7 @@ inline void ggml_cuda_op_add(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
@@ -5452,14 +5806,41 @@ inline void ggml_cuda_op_mul_mat_q(
}
static int64_t get_row_rounding(ggml_type type) {
int max_compute_capability = INT_MIN;
for (int id = 0; id < g_device_count; ++id) {
if (max_compute_capability < g_compute_capabilities[id]
&& g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
max_compute_capability = g_compute_capabilities[id];
int64_t min_compute_capability = INT_MAX;
int64_t max_compute_capability = INT_MIN;
for (int64_t id = 0; id < g_device_count; ++id) {
if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
if (min_compute_capability > g_compute_capabilities[id]) {
min_compute_capability = g_compute_capabilities[id];
}
if (max_compute_capability < g_compute_capabilities[id]) {
max_compute_capability = g_compute_capabilities[id];
}
}
}
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
switch(type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
case GGML_TYPE_F16:
return 1;
case GGML_TYPE_Q2_K:
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
case GGML_TYPE_Q3_K:
return min_compute_capability < CC_RDNA2 ? 128 : 64;
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
default:
GGML_ASSERT(false);
}
#else
switch(type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@@ -5480,6 +5861,7 @@ static int64_t get_row_rounding(ggml_type type) {
default:
GGML_ASSERT(false);
}
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
inline void ggml_cuda_op_mul_mat_vec_q(
@@ -5631,10 +6013,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low;
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
size_t src0_as;
float * src0_ddf_i = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as);
to_fp32_cuda(src0_dd_i, src0_ddf_i, row_diff*ne00, stream);
float * src0_ddq_as_f32;
size_t src0_as = 0;
if (src0->type != GGML_TYPE_F32) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
}
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
int id;
CUDA_CHECK(cudaGetDevice(&id));
@@ -5651,10 +6038,11 @@ inline void ggml_cuda_op_mul_mat_cublas(
src1_ddf_i, ne10,
&beta, dst_dd_i, ldc));
ggml_cuda_pool_free(src0_ddf_i, src0_as);
if (src0_as > 0) {
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
}
(void) dst;
(void) src0_dd_i;
(void) src1_ddq_i;
(void) src1_padded_row_size;
}
@@ -5793,7 +6181,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
const bool use_src1 = src1 != nullptr;
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
GGML_ASSERT( src0->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
@@ -5801,7 +6188,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU;
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;

2
ggml.c
View File

@@ -283,7 +283,7 @@ typedef double ggml_float;
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#ifdef __ARM_NEON
#if defined(__ARM_NEON) && !defined(_MSC_VER)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//

2
ggml.h
View File

@@ -270,7 +270,7 @@ extern "C" {
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;

View File

@@ -2609,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
memcpy(utmp, x[i].scales, 12);
const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)};
uint32x2_t mins8 = { 0 };
mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[0] &= kmask1;