Fix FATTN profiling

Piotr Wilkin
2026-05-12 23:58:28 +02:00
parent 498c78f9f4
commit e657adbcf1
11 changed files with 395 additions and 252 deletions

View File

@@ -28,22 +28,18 @@ extern "C" {
void * profiling_context;
// callback for recording a profile record from C code (set by backend when profiling)
// params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4], ne_src2[4], type_src0, type_src1, type_src2, sub_op
void (*profiling_record_fn)(void * context,
int type,
const char * name,
int split_id,
uint64_t start_ns,
uint64_t end_ns,
uint64_t bytes,
const char * extra,
const int64_t ne_src0[4],
const int64_t ne_src1[4],
const int64_t ne_src2[4],
int type_src0,
int type_src1,
int type_src2,
int sub_op);
// The callback receives the full tensor node so it can extract all sources, types,
// op_params, and sub-op information directly.
// params: context, type, name, split_id, start_ns, end_ns, bytes, extra, node
void (*profiling_record_fn)(void * context,
int type,
const char * name,
int split_id,
uint64_t start_ns,
uint64_t end_ns,
uint64_t bytes,
const char * extra,
const struct ggml_tensor * node);
};
// numa strategies
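
For illustration only (not part of this commit), a recorder written against the new signature can derive everything the old parameter list carried from the node itself; example_record_cb is a hypothetical name:

static void example_record_cb(void * context, int type, const char * name, int split_id,
                              uint64_t start_ns, uint64_t end_ns, uint64_t bytes,
                              const char * extra, const struct ggml_tensor * node) {
    // per-source shapes/types and the sub-op now come straight from the node
    int type_src0 = (node && node->src[0]) ? (int) node->src[0]->type : -1;
    int sub_op    = -1;
    if (node && node->op == GGML_OP_UNARY) {
        sub_op = (int) ggml_get_unary_op(node);
    } else if (node && node->op == GGML_OP_GLU) {
        sub_op = (int) ggml_get_glu_op(node);
    }
    (void) context; (void) type; (void) name; (void) split_id; (void) start_ns;
    (void) end_ns; (void) bytes; (void) extra; (void) type_src0; (void) sub_op;
}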

View File

@@ -3,6 +3,8 @@
#include "ggml-backend.h"
#include "ggml.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
@@ -27,12 +29,20 @@ typedef struct ggml_profile_record {
uint64_t end_ns; // end timestamp in nanoseconds
uint64_t bytes; // bytes transferred (for copy) or tensor size (for ops)
const char * extra; // fusion name for fused ops, or NULL
int64_t ne_src0[4]; // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT)
int64_t ne_src1[4]; // src[1] tensor dimensions (e.g. input matrix for MUL_MAT)
int64_t ne_src2[4]; // src[2] tensor dimensions (e.g. ids for MUL_MAT_ID)
int type_src0; // src[0] tensor type (ggml_type), -1 if N/A
int type_src1; // src[1] tensor type (ggml_type), -1 if N/A
int type_src2; // src[2] tensor type (ggml_type), -1 if N/A
// Output tensor info
int64_t ne[4]; // output tensor dimensions
int out_type; // output tensor type (ggml_type), -1 if N/A
// Source tensors (up to GGML_MAX_SRC). n_src is one past the last populated
// slot; slots without a source have type_src == -1 and zeroed ne/nb.
int n_src;
int64_t ne_src[GGML_MAX_SRC][4]; // per-source dimensions
int64_t nb_src[GGML_MAX_SRC][4]; // per-source strides (bytes)
int type_src[GGML_MAX_SRC]; // per-source ggml_type, -1 if not present
// Operation parameters (raw bytes copied from ggml_tensor::op_params)
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
int sub_op; // sub-operation (ggml_unary_op or ggml_glu_op), -1 if N/A
} ggml_profile_record;
@@ -61,6 +71,13 @@ struct ggml_backend_profiler {
typedef struct ggml_backend_profiler * ggml_backend_profiler_t;
// Populate the per-node fields of a ggml_profile_record from a ggml_tensor node:
// ne, out_type, n_src, ne_src, nb_src, type_src, op_params, sub_op.
// All other fields (type/name/backend_id/split_id/timestamps/bytes/extra) must
// be filled in separately by the backend that records the event.
GGML_API void ggml_profile_record_from_tensor(struct ggml_profile_record * rec,
const struct ggml_tensor * node);
// Register a profiler on a backend (called by backend during init)
// The profiler is owned by the backend and will be freed when the backend is freed
GGML_API void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler);
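
A minimal usage sketch, mirroring the backend call sites later in this commit (t_start/t_end stand for timestamps the backend measured itself):

ggml_profile_record rec = {};
rec.type     = GGML_PROFILE_EVENT_OP;
rec.name     = ggml_op_name(node->op);
rec.split_id = -1;
rec.start_ns = t_start;
rec.end_ns   = t_end;
rec.bytes    = ggml_nbytes(node);
rec.extra    = NULL;
// fills ne, out_type, n_src, ne_src, nb_src, type_src, op_params, sub_op
ggml_profile_record_from_tensor(&rec, node);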

View File

@@ -1563,6 +1563,38 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
return true;
}
// Build a COPY profiling record. Copies have no real ggml_tensor "node" backing
// them, so we synthesize one source describing the input tensor that was moved.
static ggml_profile_record make_copy_record(const char * copy_dir, int backend_id, int split_id,
uint64_t start_ns, uint64_t end_ns, uint64_t bytes,
const struct ggml_tensor * input) {
ggml_profile_record rec = {};
rec.type = GGML_PROFILE_EVENT_COPY;
rec.name = copy_dir;
rec.backend_id = backend_id;
rec.split_id = split_id;
rec.start_ns = start_ns;
rec.end_ns = end_ns;
rec.bytes = bytes;
rec.extra = input ? input->name : NULL;
rec.out_type = -1;
rec.sub_op = -1;
rec.n_src = 0;
if (input != NULL) {
// Describe the input tensor as src[0] so consumers can inspect its shape.
rec.n_src = 1;
memcpy(rec.ne_src[0], input->ne, sizeof(rec.ne_src[0]));
for (int d = 0; d < 4; d++) {
rec.nb_src[0][d] = (int64_t) input->nb[d];
}
rec.type_src[0] = (int) input->type;
}
for (int i = rec.n_src; i < GGML_MAX_SRC; i++) {
rec.type_src[i] = -1;
}
return rec;
}
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
struct ggml_backend_sched_split * splits = sched->splits;
@@ -1620,9 +1652,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
copy_dir = "copy_D2H";
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id,
copy_start, copy_end, ggml_nbytes(input), input->name,
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0}, -1, -1, -1, -1 });
sched->copy_records.push_back(make_copy_record(copy_dir, split_backend_id, split_id,
copy_start, copy_end, ggml_nbytes(input), input));
} else {
ggml_backend_tensor_copy(input, input_cpy);
}
@@ -1740,10 +1771,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
copy_dir = "copy_D2H";
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, moe_copy_start, moe_copy_end,
(uint64_t) total_copied_bytes, input->name,
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0}, -1, -1, -1, -1 });
sched->copy_records.push_back(make_copy_record(copy_dir, split_backend_id, split_id,
moe_copy_start, moe_copy_end,
(uint64_t) total_copied_bytes, input));
}
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
@@ -1778,9 +1808,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
copy_dir = "copy_D2H";
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, copy_start, copy_end, ggml_nbytes(input), input->name,
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0}, -1, -1, -1, -1 });
sched->copy_records.push_back(make_copy_record(copy_dir, split_backend_id, split_id,
copy_start, copy_end, ggml_nbytes(input), input));
} else {
ggml_backend_tensor_copy(input, input_cpy);
}
@@ -1799,9 +1828,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
copy_dir = "copy_D2H";
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, copy_start, copy_end, ggml_nbytes(input), input->name,
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0}, -1, -1, -1, -1 });
sched->copy_records.push_back(make_copy_record(copy_dir, split_backend_id, split_id,
copy_start, copy_end, ggml_nbytes(input), input));
}
}
}
@@ -2656,7 +2684,7 @@ void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) {
s.max_ns = dur;
s.count = 1;
s.total_bytes = rec.bytes;
memcpy(s.representative_ne, rec.ne_src0, sizeof(s.representative_ne));
memcpy(s.representative_ne, rec.ne_src[0], sizeof(s.representative_ne));
stats.push_back(s);
}
}
@@ -2717,7 +2745,7 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f
}
fprintf(fp, "{\n");
fprintf(fp, " \"version\": 2,\n");
fprintf(fp, " \"version\": 3,\n");
fprintf(fp, " \"profiler\": \"ggml\",\n");
fprintf(fp, " \"total_records\": %d,\n", (int) sched->profiling_records.size());
fprintf(fp, " \"total_ns\": %llu,\n", (unsigned long long) total_ns);
@@ -2763,18 +2791,41 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f
fprintf(fp, "null");
}
// Tensor dimensions (all source tensors)
fprintf(fp, ", \"ne_src0\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src0[0], (long long) rec.ne_src0[1],
(long long) rec.ne_src0[2], (long long) rec.ne_src0[3]);
fprintf(fp, ", \"ne_src1\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src1[0], (long long) rec.ne_src1[1],
(long long) rec.ne_src1[2], (long long) rec.ne_src1[3]);
fprintf(fp, ", \"ne_src2\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src2[0], (long long) rec.ne_src2[1],
(long long) rec.ne_src2[2], (long long) rec.ne_src2[3]);
// Output tensor info
fprintf(fp, ", \"ne\": [%lld, %lld, %lld, %lld]", (long long) rec.ne[0], (long long) rec.ne[1],
(long long) rec.ne[2], (long long) rec.ne[3]);
fprintf(fp, ", \"out_type\": %d", rec.out_type);
// Source tensors
fprintf(fp, ", \"n_src\": %d", rec.n_src);
fprintf(fp, ", \"ne_src\": [");
for (int s = 0; s < rec.n_src; s++) {
fprintf(fp, "%s[%lld, %lld, %lld, %lld]", s == 0 ? "" : ", ",
(long long) rec.ne_src[s][0], (long long) rec.ne_src[s][1],
(long long) rec.ne_src[s][2], (long long) rec.ne_src[s][3]);
}
fprintf(fp, "]");
fprintf(fp, ", \"nb_src\": [");
for (int s = 0; s < rec.n_src; s++) {
fprintf(fp, "%s[%lld, %lld, %lld, %lld]", s == 0 ? "" : ", ",
(long long) rec.nb_src[s][0], (long long) rec.nb_src[s][1],
(long long) rec.nb_src[s][2], (long long) rec.nb_src[s][3]);
}
fprintf(fp, "]");
fprintf(fp, ", \"type_src\": [");
for (int s = 0; s < rec.n_src; s++) {
fprintf(fp, "%s%d", s == 0 ? "" : ", ", rec.type_src[s]);
}
fprintf(fp, "]");
// op_params (full 16-int32 block, matching export-graph-ops format)
fprintf(fp, ", \"op_params\": [");
const int n_op_params = (int) (sizeof(rec.op_params) / sizeof(rec.op_params[0]));
for (int p = 0; p < n_op_params; p++) {
fprintf(fp, "%s%d", p == 0 ? "" : ", ", rec.op_params[p]);
}
fprintf(fp, "]");
// Tensor types (quantization)
fprintf(fp, ", \"type_src0\": %d", rec.type_src0);
fprintf(fp, ", \"type_src1\": %d", rec.type_src1);
fprintf(fp, ", \"type_src2\": %d", rec.type_src2);
fprintf(fp, ", \"sub_op\": %d", rec.sub_op);
fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : "");
@@ -2882,9 +2933,9 @@ int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * f
s.max_ns = dur;
s.count = 1;
s.total_bytes = rec.bytes;
memcpy(s.representative_ne_src0, rec.ne_src0, sizeof(s.representative_ne_src0));
memcpy(s.representative_ne_src1, rec.ne_src1, sizeof(s.representative_ne_src1));
memcpy(s.representative_ne_src2, rec.ne_src2, sizeof(s.representative_ne_src2));
memcpy(s.representative_ne_src0, rec.ne_src[0], sizeof(s.representative_ne_src0));
memcpy(s.representative_ne_src1, rec.ne_src[1], sizeof(s.representative_ne_src1));
memcpy(s.representative_ne_src2, rec.ne_src[2], sizeof(s.representative_ne_src2));
stats.push_back(s);
}
}

View File

@@ -274,19 +274,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
rec.end_ns = t_end;
rec.bytes = ggml_nbytes(node);
rec.extra = NULL;
rec.type_src0 = node->src[0] ? (int)node->src[0]->type : -1;
rec.type_src1 = node->src[1] ? (int)node->src[1]->type : -1;
rec.type_src2 = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? (int)node->src[2]->type : -1;
int sub_op = -1;
if (node->op == GGML_OP_UNARY) {
sub_op = (int)ggml_get_unary_op(node);
} else if (node->op == GGML_OP_GLU) {
sub_op = (int)ggml_get_glu_op(node);
}
rec.sub_op = sub_op;
if (node->src[0]) { memcpy(rec.ne_src0, node->src[0]->ne, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
if (node->src[1]) { memcpy(rec.ne_src1, node->src[1]->ne, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
if (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) { memcpy(rec.ne_src2, node->src[2]->ne, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); }
ggml_profile_record_from_tensor(&rec, node);
ctx->profiling_records.push_back(rec);
}
}

View File

@@ -3056,24 +3056,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (state->ith == 0) {
uint64_t t_end = ggml_profiler_time_ns();
{
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
int type_src0 = node->src[0] ? (int)node->src[0]->type : -1;
int type_src1 = node->src[1] ? (int)node->src[1]->type : -1;
int type_src2 = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? (int)node->src[2]->type : -1;
int sub_op = -1;
if (node->op == GGML_OP_UNARY) {
sub_op = (int)ggml_get_unary_op(node);
} else if (node->op == GGML_OP_GLU) {
sub_op = (int)ggml_get_glu_op(node);
}
cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */,
ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL,
src0_ne, src1_ne, src2_ne, type_src0, type_src1, type_src2, sub_op);
}
cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */,
ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL,
node);
}
if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {

View File

@@ -174,21 +174,15 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe
}
// Callback function for recording CPU profiling events from C code (ggml-cpu.c)
static void ggml_cpu_profiler_record_callback(void * context,
int type,
const char * name,
int split_id,
uint64_t start_ns,
uint64_t end_ns,
uint64_t bytes,
const char * extra,
const int64_t ne_src0[4],
const int64_t ne_src1[4],
const int64_t ne_src2[4],
int type_src0,
int type_src1,
int type_src2,
int sub_op) {
static void ggml_cpu_profiler_record_callback(void * context,
int type,
const char * name,
int split_id,
uint64_t start_ns,
uint64_t end_ns,
uint64_t bytes,
const char * extra,
const struct ggml_tensor * node) {
auto * cpu_ctx = (ggml_backend_cpu_context *) context;
ggml_profile_record rec;
rec.type = (enum ggml_profile_event_type) type;
@@ -199,25 +193,7 @@ static void ggml_cpu_profiler_record_callback(void * context,
rec.end_ns = end_ns;
rec.bytes = bytes;
rec.extra = extra;
rec.type_src0 = type_src0;
rec.type_src1 = type_src1;
rec.type_src2 = type_src2;
rec.sub_op = sub_op;
if (ne_src0) {
memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0));
} else {
memset(rec.ne_src0, 0, sizeof(rec.ne_src0));
}
if (ne_src1) {
memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1));
} else {
memset(rec.ne_src1, 0, sizeof(rec.ne_src1));
}
if (ne_src2) {
memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2));
} else {
memset(rec.ne_src2, 0, sizeof(rec.ne_src2));
}
ggml_profile_record_from_tensor(&rec, node);
cpu_ctx->profiling_records.push_back(rec);
}

View File

@@ -139,8 +139,7 @@ struct ggml_cuda_profiler_state {
}
void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra,
const int64_t ne_src0[4], const int64_t ne_src1[4], const int64_t ne_src2[4],
int type_src0, int type_src1, int type_src2, int sub_op = -1) {
const ggml_tensor * node) {
cudaEvent_t ev;
(void) cudaEventCreate(&ev);
(void) cudaEventRecord(ev, stream);
@@ -156,13 +155,7 @@ struct ggml_cuda_profiler_state {
rec.end_ns = 0;
rec.bytes = bytes;
rec.extra = extra;
rec.type_src0 = type_src0;
rec.type_src1 = type_src1;
rec.type_src2 = type_src2;
rec.sub_op = sub_op;
if (ne_src0) { memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
if (ne_src1) { memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
if (ne_src2) { memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); }
ggml_profile_record_from_tensor(&rec, node);
records.push_back(rec);
}
@@ -4485,25 +4478,13 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) {
int sub_op = -1;
if (node->op == GGML_OP_UNARY) {
sub_op = (int)ggml_get_unary_op(node);
} else if (node->op == GGML_OP_GLU) {
sub_op = (int)ggml_get_glu_op(node);
}
cuda_ctx->profiler_state->record_end(
ggml_op_name(node->op),
-1,
cuda_ctx->profiler_state->split_id,
ggml_nbytes(node),
nullptr,
node->src[0] ? node->src[0]->ne : nullptr,
node->src[1] ? node->src[1]->ne : nullptr,
(node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : nullptr,
node->src[0] ? (int)node->src[0]->type : -1,
node->src[1] ? (int)node->src[1]->type : -1,
(node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? (int)node->src[2]->type : -1,
sub_op
node
);
}

View File

@@ -66,7 +66,8 @@ enum mmvq_parameter_table_id {
MMVQ_PARAMETERS_GCN,
MMVQ_PARAMETERS_RDNA2,
MMVQ_PARAMETERS_RDNA3_0,
MMVQ_PARAMETERS_RDNA4
MMVQ_PARAMETERS_RDNA4,
MMVQ_PARAMETERS_BLACKWELL
};
static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
@@ -78,6 +79,8 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
return MMVQ_PARAMETERS_RDNA2;
#elif defined(GCN) || defined(CDNA)
return MMVQ_PARAMETERS_GCN;
#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL
return MMVQ_PARAMETERS_BLACKWELL;
#else
return MMVQ_PARAMETERS_GENERIC;
#endif
@@ -96,6 +99,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
return MMVQ_PARAMETERS_GCN;
}
if (cc >= GGML_CUDA_CC_BLACKWELL) {
return MMVQ_PARAMETERS_BLACKWELL;
}
return MMVQ_PARAMETERS_GENERIC;
}
@@ -294,7 +300,7 @@ static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
}
static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_dst, mmvq_parameter_table_id table_id) {
if (table_id == MMVQ_PARAMETERS_GENERIC) {
if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_BLACKWELL) {
switch (ncols_dst) {
case 1:
case 2:
@@ -375,7 +381,14 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
switch (ncols_dst) {
case 1:
return small_k ? nwarps : 1;
// Single-token generation: process 2 rows per block instead of 1.
// This halves the number of blocks launched (e.g. 5120 → 2560), reducing
// kernel launch overhead and improving L1 cache utilization, since the y
// vector (quantized src1 as q8_1, ~5.4 KB for K=5120) is shared across the
// rows within a block. Measured as an 11.2% improvement on SM86 (RTX 3080).
// When K is very small (the small_k path), use all warps on one row to
// maximize K-dimension parallelism.
return small_k ? nwarps : 2;
case 2:
case 3:
case 4:
@@ -388,6 +401,26 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
return 1;
}
}
if (table_id == MMVQ_PARAMETERS_BLACKWELL) {
// Blackwell (SM120): 2 rows/block showed no benefit (the hardware scheduler
// already handles large grids efficiently), so use 4 rows/block for better
// L1 amortization of the y vector: it (~5.4 KB for K=5120) is loaded once
// per block and reused across 4 rows, halving memory traffic vs the 2-row path.
switch (ncols_dst) {
case 1:
return small_k ? nwarps : 4;
case 2:
case 3:
case 4:
case 5:
case 6:
case 7:
case 8:
return 4;
default:
return 1;
}
}
return 1;
}
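
A quick check of the launch arithmetic in the comments above: with 5120 output rows at ncols_dst == 1, the generic/GCN path launches 5120 / 2 = 2560 blocks and the Blackwell path 5120 / 4 = 1280, so each block's load of the shared y vector is amortized over 2 and 4 rows respectively.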

View File

@@ -2,6 +2,7 @@
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml.h"
#include <stdio.h>
#include <string.h>
@@ -38,6 +39,55 @@ uint64_t ggml_profiler_time_ns(void) {
#endif
}
//
// Record helpers
//
void ggml_profile_record_from_tensor(ggml_profile_record * rec, const struct ggml_tensor * node) {
if (rec == NULL) {
return;
}
// Output tensor info
if (node != NULL) {
memcpy(rec->ne, node->ne, sizeof(rec->ne));
rec->out_type = (int) node->type;
memcpy(rec->op_params, node->op_params, sizeof(rec->op_params));
} else {
memset(rec->ne, 0, sizeof(rec->ne));
rec->out_type = -1;
memset(rec->op_params, 0, sizeof(rec->op_params));
}
// Sub-op (UNARY/GLU)
rec->sub_op = -1;
if (node != NULL) {
if (node->op == GGML_OP_UNARY) {
rec->sub_op = (int) ggml_get_unary_op(node);
} else if (node->op == GGML_OP_GLU) {
rec->sub_op = (int) ggml_get_glu_op(node);
}
}
// Source tensors
rec->n_src = 0;
for (int i = 0; i < GGML_MAX_SRC; i++) {
const struct ggml_tensor * src = (node != NULL) ? node->src[i] : NULL;
if (src == NULL) {
memset(rec->ne_src[i], 0, sizeof(rec->ne_src[i]));
memset(rec->nb_src[i], 0, sizeof(rec->nb_src[i]));
rec->type_src[i] = -1;
} else {
memcpy(rec->ne_src[i], src->ne, sizeof(rec->ne_src[i]));
for (int d = 0; d < 4; d++) {
rec->nb_src[i][d] = (int64_t) src->nb[d];
}
rec->type_src[i] = (int) src->type;
rec->n_src = i + 1;
}
}
}
//
// Backend profiler registration
//

View File

@@ -15092,10 +15092,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
}
if (has_profiler && node != nullptr) {
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
? ctx->profiler_state->cpu_timestamps[i] : 0;
@@ -15108,21 +15104,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
rec.end_ns = cpu_ts + duration_ns;
rec.bytes = ggml_nbytes(node);
rec.extra = name; // fusion name or NULL
rec.type_src0 = node->src[0] ? (int)node->src[0]->type : -1;
rec.type_src1 = node->src[1] ? (int)node->src[1]->type : -1;
rec.type_src2 = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? (int)node->src[2]->type : -1;
{
int sub_op = -1;
if (node->op == GGML_OP_UNARY) {
sub_op = (int)ggml_get_unary_op(node);
} else if (node->op == GGML_OP_GLU) {
sub_op = (int)ggml_get_glu_op(node);
}
rec.sub_op = sub_op;
}
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2));
ggml_profile_record_from_tensor(&rec, node);
ctx->profiler_state->records.push_back(rec);
}
}
@@ -15153,10 +15135,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
? ctx->profiler_state->cpu_timestamps[i] : 0;
// In concurrent mode, report the group as a single combined operation
auto * node = nodes[0];
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
uint64_t total_bytes = 0;
for (size_t j = 0; j < nodes.size(); j++) {
@@ -15172,21 +15150,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
rec.end_ns = cpu_ts + duration_ns;
rec.bytes = total_bytes;
rec.extra = names[0]; // fusion name of first op, or NULL
rec.type_src0 = node->src[0] ? (int)node->src[0]->type : -1;
rec.type_src1 = node->src[1] ? (int)node->src[1]->type : -1;
rec.type_src2 = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? (int)node->src[2]->type : -1;
{
int sub_op = -1;
if (node->op == GGML_OP_UNARY) {
sub_op = (int)ggml_get_unary_op(node);
} else if (node->op == GGML_OP_GLU) {
sub_op = (int)ggml_get_glu_op(node);
}
rec.sub_op = sub_op;
}
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2));
ggml_profile_record_from_tensor(&rec, node);
ctx->profiler_state->records.push_back(rec);
}
}

View File

@@ -134,7 +134,10 @@ def _compute_output_ne(op_id: int, ne0: list, ne1: list, ne2: list) -> list | No
if op_id in (46, 25): # SOFT_MAX, RMS_NORM
return list(ne0)
if op_id == 73: # FLASH_ATTN_EXT
return [ne1[1], ne1[1], ne0[2], ne0[3]]
# Per ggml_flash_attn_ext: result.ne = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] }.
# When V was not captured (legacy records), assume hsv == hsk and use q->ne[0].
hsv = ne2[0] if (ne2 and ne2[0] > 0) else ne0[0]
return [hsv, ne0[2], ne0[1], ne0[3]]
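# Worked example (illustrative shapes only): q = ne0 = [128, 512, 32, 1] with
# v = ne2 = [64, n_kv, 32, 1] yields [64, 32, 512, 1]; if V was not captured,
# hsv falls back to ne0[0] = 128 and the result is [128, 32, 512, 1].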
if op_id == 40: # GET_ROWS
return [ne0[0], ne1[1], ne1[2], ne1[3]]
if op_id == 41: # GET_ROWS_BACK
@@ -151,6 +154,10 @@ def _compute_output_ne(op_id: int, ne0: list, ne1: list, ne2: list) -> list | No
return None
GGML_MAX_SRC = 10
GGML_MAX_OP_PARAMS_I32 = 16 # 64 bytes / sizeof(int32)
@dataclass
class ProfileRecord:
type: int
@@ -161,14 +168,42 @@ class ProfileRecord:
duration_ns: int
bytes: int
extra: Optional[str]
ne_src0: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
ne_src1: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
ne_src2: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
type_src0: int = -1
type_src1: int = -1
type_src2: int = -1
# Output tensor info
ne: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
out_type: int = -1
# Source tensors (variable length, up to GGML_MAX_SRC)
ne_src: list[list[int]] = field(default_factory=list)
nb_src: list[list[int]] = field(default_factory=list)
type_src: list[int] = field(default_factory=list)
# Operation parameters (16 int32, raw from ggml_tensor::op_params)
op_params: list[int] = field(default_factory=lambda: [0] * GGML_MAX_OP_PARAMS_I32)
sub_op: int = -1
# --- Convenience accessors mirroring the old API ---
@property
def ne_src0(self) -> list[int]:
return self.ne_src[0] if len(self.ne_src) > 0 else [0, 0, 0, 0]
@property
def ne_src1(self) -> list[int]:
return self.ne_src[1] if len(self.ne_src) > 1 else [0, 0, 0, 0]
@property
def ne_src2(self) -> list[int]:
return self.ne_src[2] if len(self.ne_src) > 2 else [0, 0, 0, 0]
@property
def type_src0(self) -> int:
return self.type_src[0] if len(self.type_src) > 0 else -1
@property
def type_src1(self) -> int:
return self.type_src[1] if len(self.type_src) > 1 else -1
@property
def type_src2(self) -> int:
return self.type_src[2] if len(self.type_src) > 2 else -1
@property
def sub_op_name(self) -> str:
if self.sub_op < 0:
@@ -209,9 +244,7 @@ class ProfileRecord:
def shape_str(self) -> str:
"""Human-readable tensor shapes, e.g. '[4096, 4096] x [4096, 1] x [8, 1]'."""
parts = []
for ne, gt in [(self.ne_src0, self.type_src0),
(self.ne_src1, self.type_src1),
(self.ne_src2, self.type_src2)]:
for ne, gt in zip(self.ne_src, self.type_src):
s = self._fmt_ne(ne)
if s:
type_name = GGML_TYPE_NAMES.get(gt, None)
@@ -233,12 +266,13 @@ class ProfileRecord:
"duration_ns": self.duration_ns,
"bytes": self.bytes,
"extra": self.extra,
"ne_src0": self.ne_src0,
"ne_src1": self.ne_src1,
"ne_src2": self.ne_src2,
"type_src0": self.type_src0,
"type_src1": self.type_src1,
"type_src2": self.type_src2,
"ne": self.ne,
"out_type": self.out_type,
"n_src": len(self.ne_src),
"ne_src": self.ne_src,
"nb_src": self.nb_src,
"type_src": self.type_src,
"op_params": self.op_params,
"sub_op": self.sub_op,
}
@@ -310,16 +344,69 @@ class ProfileData:
records = []
def _pad_ne(v):
if isinstance(v, list) and len(v) < 4:
return v + [0] * (4 - len(v))
return list(v) + [0] * (4 - len(v))
if not isinstance(v, list):
return [0, 0, 0, 0]
return v
return list(v)
def _load_sources(r) -> tuple[list[list[int]], list[list[int]], list[int]]:
"""Read source tensor arrays, supporting both v3+ (arrays) and v2 (ne_src0/1/2) JSON."""
ne_list_raw = r.get("ne_src")
if isinstance(ne_list_raw, list):
# v3+ format
ne_src = [_pad_ne(x) for x in ne_list_raw]
nb_raw = r.get("nb_src", [])
if isinstance(nb_raw, list):
nb_src = [_pad_ne(x) for x in nb_raw]
else:
nb_src = []
while len(nb_src) < len(ne_src):
nb_src.append([0, 0, 0, 0])
type_raw = r.get("type_src", [])
if isinstance(type_raw, list):
type_src = [int(t) for t in type_raw]
else:
type_src = []
while len(type_src) < len(ne_src):
type_src.append(-1)
return ne_src, nb_src[:len(ne_src)], type_src[:len(ne_src)]
# Legacy v2 fallback
ne_src: list[list[int]] = []
type_src: list[int] = []
for i in range(3):
key_ne = f"ne_src{i}"
key_type = f"type_src{i}"
ne_v = r.get(key_ne)
if ne_v is None and i == 0:
ne_v = r.get("ne")
if ne_v is None:
break
ne_padded = _pad_ne(ne_v)
if all(v == 0 for v in ne_padded) and i > 0:
break
ne_src.append(ne_padded)
type_src.append(int(r.get(key_type, -1)))
nb_src = [[0, 0, 0, 0] for _ in ne_src]
return ne_src, nb_src, type_src
def _load_op_params(r) -> list[int]:
raw = r.get("op_params")
if isinstance(raw, list):
ops = [int(x) for x in raw[:GGML_MAX_OP_PARAMS_I32]]
while len(ops) < GGML_MAX_OP_PARAMS_I32:
ops.append(0)
return ops
return [0] * GGML_MAX_OP_PARAMS_I32
for r in data.get("records", []):
# Support both old "ne" format and new "ne_src0"/"ne_src1" format
ne_src0 = _pad_ne(r.get("ne_src0", r.get("ne", [0, 0, 0, 0])))
ne_src1 = _pad_ne(r.get("ne_src1", [0, 0, 0, 0]))
ne_src2 = _pad_ne(r.get("ne_src2", [0, 0, 0, 0]))
ne_src, nb_src, type_src = _load_sources(r)
# v3+ records have a real "ne" (output shape). Legacy v2 records did not —
# leave zero so export_graph_ops falls back to op-specific shape inference.
ne_raw = r.get("ne") if "ne_src" in r else None
ne_out = _pad_ne(ne_raw) if isinstance(ne_raw, list) and ne_raw else [0, 0, 0, 0]
records.append(ProfileRecord(
type=r.get("type", 0),
name=r.get("name", "unknown"),
@@ -329,13 +416,13 @@ class ProfileData:
duration_ns=r.get("duration_ns", 0),
bytes=r.get("bytes", 0),
extra=r.get("extra"),
ne_src0=ne_src0,
ne_src1=ne_src1,
ne_src2=ne_src2,
type_src0=r.get("type_src0", -1),
type_src1=r.get("type_src1", -1),
type_src2=r.get("type_src2", -1),
sub_op=r.get("sub_op", -1),
ne=ne_out,
out_type=int(r.get("out_type", -1)),
ne_src=ne_src,
nb_src=nb_src,
type_src=type_src,
op_params=_load_op_params(r),
sub_op=int(r.get("sub_op", -1)),
))
backends_raw = data.get("backends", [])
@@ -566,7 +653,12 @@ class ProfileData:
print(f"Open chrome://tracing in Chrome/Edge and load this file.")
def export_graph_ops(self, filepath: str | Path) -> None:
"""Export operations in export-graph-ops format for test-backend-ops --test-file."""
"""Export operations in export-graph-ops format for test-backend-ops --test-file.
Output line layout matches tests/export-graph-ops.cpp:
<op> <out_type> <ne[0..3]> <n_op_params> <op_params...> <n_sources>
(<src_type> <src_ne[0..3]> <src_nb[0..3]>)* <name|-> [<backend>]
"""
seen: set[tuple] = set()
lines: list[str] = []
@@ -585,41 +677,49 @@ class ProfileData:
if op_id in _EXPORT_SKIP_OPS:
continue
ne0 = rec.ne_src0
ne1 = rec.ne_src1
ne2 = rec.ne_src2
# --- Build the source list directly from the captured arrays ---
sources: list[tuple[int, list[int], list[int]]] = []
for i in range(len(rec.ne_src)):
ne_i = rec.ne_src[i]
if not any(v != 0 for v in ne_i):
continue
src_type = rec.type_src[i] if i < len(rec.type_src) and rec.type_src[i] >= 0 else 0
nb_i = rec.nb_src[i] if i < len(rec.nb_src) else [0, 0, 0, 0]
sources.append((src_type, list(ne_i), list(nb_i)))
type_src0 = rec.type_src0 if rec.type_src0 >= 0 else 0
type_src1 = rec.type_src1 if rec.type_src1 >= 0 else 0
type_src2 = rec.type_src2 if rec.type_src2 >= 0 else 0
# MUL_MAT_ID needs the ids tensor as src[2]; synthesize one if missing.
if op_id == 30 and len(sources) == 2:
sources.append((24, [sources[1][1][1], 1, 1, 1], [0, 0, 0, 0])) # I32
sources: list[tuple[int, list, list]] = []
if any(v != 0 for v in ne0):
sources.append((type_src0, ne0, [0, 0, 0, 0]))
if any(v != 0 for v in ne1):
sources.append((type_src1, ne1, [0, 0, 0, 0]))
if op_id == 30: # MUL_MAT_ID: ensure rows tensor (src2) is present
if len(sources) < 3 and any(v != 0 for v in ne2):
sources.append((type_src2, ne2, [0, 0, 0, 0]))
elif len(sources) < 3 and len(sources) >= 2:
sources.append((24, [sources[1][1][1], 1, 1, 1], [0, 0, 0, 0])) # I32
elif any(v != 0 for v in ne2):
sources.append((type_src2, ne2, [0, 0, 0, 0]))
src_ne0 = sources[0][1] if len(sources) > 0 else [0, 0, 0, 0]
src_ne1 = sources[1][1] if len(sources) > 1 else [0, 0, 0, 0]
src_ne2 = sources[2][1] if len(sources) > 2 else [0, 0, 0, 0]
ne_out = _compute_output_ne(op_id, src_ne0, src_ne1, src_ne2)
if ne_out is None:
if not sources:
continue
op_params: list[int] = []
if op_id == 30 and len(sources) >= 2: # MUL_MAT_ID
op_params.append(sources[1][1][1])
elif op_id in (86, 95) and rec.sub_op >= 0: # UNARY, GLU
op_params.append(rec.sub_op)
# --- Output shape ---
if any(v != 0 for v in rec.ne):
ne_out = list(rec.ne)
else:
# Legacy records without captured output ne: fall back to op-specific formula.
src_ne0 = sources[0][1] if len(sources) > 0 else [0, 0, 0, 0]
src_ne1 = sources[1][1] if len(sources) > 1 else [0, 0, 0, 0]
src_ne2 = sources[2][1] if len(sources) > 2 else [0, 0, 0, 0]
computed = _compute_output_ne(op_id, src_ne0, src_ne1, src_ne2)
if computed is None:
continue
ne_out = computed
# --- Output type ---
out_type = rec.out_type if rec.out_type >= 0 else 0
# --- op_params: emit the full 16-int32 block when captured. ---
if any(p != 0 for p in rec.op_params):
op_params = list(rec.op_params)
else:
# Legacy fallback synthesis (best-effort for v2 JSON files).
op_params = []
if op_id == 30 and len(sources) >= 2: # MUL_MAT_ID
op_params.append(sources[1][1][1])
elif op_id in (86, 95) and rec.sub_op >= 0: # UNARY, GLU
op_params.append(rec.sub_op)
bname = ""
if rec.backend_id in backend_by_id:
@@ -627,24 +727,26 @@ class ProfileData:
if not bname or bname == "unknown":
bname = backend_by_id[rec.backend_id].get("name", "")
key = (op_id, tuple(ne_out), tuple(op_params), tuple((s[0], tuple(s[1])) for s in sources), bname)
key = (op_id, out_type, tuple(ne_out), tuple(op_params),
tuple((s[0], tuple(s[1]), tuple(s[2])) for s in sources), bname)
if key in seen:
continue
seen.add(key)
line = f"{op_id} 0 {ne_out[0]} {ne_out[1]} {ne_out[2]} {ne_out[3]} "
line += f"{len(op_params)}"
for p in op_params:
line += f" {p}"
line += f" {len(sources)}"
parts: list[str] = [str(op_id), str(out_type),
str(ne_out[0]), str(ne_out[1]), str(ne_out[2]), str(ne_out[3])]
parts.append(str(len(op_params)))
parts.extend(str(p) for p in op_params)
parts.append(str(len(sources)))
for src_type, src_ne, src_nb in sources:
line += f" {src_type} {src_ne[0]} {src_ne[1]} {src_ne[2]} {src_ne[3]} {src_nb[0]} {src_nb[1]} {src_nb[2]} {src_nb[3]}"
name = rec.name if rec.name else "-"
line += f" {name}"
parts.append(str(src_type))
parts.extend(str(v) for v in src_ne)
parts.extend(str(v) for v in src_nb)
parts.append(rec.name if rec.name else "-")
if bname:
line += f" {bname}"
line += "\n"
lines.append(line)
parts.append(bname)
lines.append(" ".join(parts) + "\n")
with open(filepath, "w") as f:
f.writelines(lines)
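
As a concrete illustration of the line layout (all values hypothetical, and assuming GGML_OP_MUL_MAT has op id 29), a v3 MUL_MAT record with two F32 sources and all-zero op_params (so the legacy fallback emits an empty op_params list) would serialize as:

29 0 4096 1 1 1 0 2 0 4096 4096 1 1 4 16384 67108864 67108864 0 4096 1 1 1 4 16384 16384 16384 ffn_out-0 CUDA0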