Compare commits


4 commits: b7617 ... b7621

Aman Gupta
e57f52334b ggml-cuda: fixes for concurrent streams (#18496) 2026-01-03 23:15:01 +08:00
Georgi Gerganov
a554a1ecc7 context : fix reserve token padding to n_seqs (#18536) 2026-01-03 15:45:34 +02:00
Johannes Gäßler
0f2e42ca1d CUDA: only allocate FA tmp buffer if needed (#18564) 2026-01-03 13:55:53 +01:00
pl752
9dba9f5352 (Bugfix, ggml-cuda) Pool alloc count fix + small size computation type adjustment (#18559) 2026-01-03 11:13:40 +01:00

    * CUDA: Fixed obj byte size instead of obj count being passed to pool alloc (fattn-common, dst_tmp_meta)

    * CUDA: Explicitly casted some of the int alloc counts before multiplication in argsort

    ---------

    Co-authored-by: pl752 <maximpl752@gmail.com>
5 changed files with 48 additions and 19 deletions

ggml/src/ggml-cuda/argsort.cu

@@ -29,8 +29,8 @@ static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
                                      const int nrows,
                                      ggml_sort_order order,
                                      cudaStream_t stream) {
-    ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
-    ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
+    ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ((size_t) ncols) * nrows);
+    ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ((size_t) ncols) * nrows);
     ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
     int * temp_indices = temp_indices_alloc.get();
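
Why the casts matter (editor's note, not part of the diff): `ncols` and `nrows` are `int`, so `ncols * nrows` is evaluated in 32-bit arithmetic and can overflow before the implicit widening to the allocator's `size_t` count parameter. Casting one operand first forces a 64-bit multiply. A minimal sketch of the failure mode:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    const int ncols = 70000;
    const int nrows = 70000; // true product is 4,900,000,000 > INT_MAX

    // int * int overflows in 32-bit arithmetic before the widening
    // conversion happens (signed overflow is undefined behavior in C++).
    const long long bad  = ncols * nrows;
    // Casting one operand first promotes the whole product to 64 bits.
    const long long good = (std::size_t) ncols * nrows;

    std::printf("bad = %lld, good = %lld\n", bad, good);
    return 0;
}
```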

ggml/src/ggml-cuda/common.cuh

@@ -1063,6 +1063,7 @@ struct ggml_cuda_graph {
     bool disable_due_to_too_many_updates = false;
     bool disable_due_to_failed_graph_capture = false;
     int number_consecutive_updates = 0;
+    bool cuda_graphs_enabled = false;
     std::vector<ggml_graph_node_properties> ggml_graph_properties;
 #endif
 };

ggml/src/ggml-cuda/fattn-common.cuh

@@ -918,7 +918,9 @@ void launch_fattn(
         blocks_num.y = 1;
         blocks_num.z = 1;
-        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
+        if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
+            dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
+        }
     } else {
         const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.
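
Editor's note on the dst_tmp_meta change: the old call passed a byte size (`... * sizeof(float)`) where `ggml_cuda_pool_alloc<T>::alloc` expects an element count, over-allocating by a factor of `sizeof(T)`. The new factor also suggests the buffer's element is two floats wide, since (2*2 + DV) floats occupy the same bytes as (2 + DV/2) float2 values; that reading is inferred from the expressions, not stated in the diff. A self-contained sketch of the bytes-vs-count mixup, using a hypothetical `pool_alloc_t` stand-in:

```cpp
#include <cstddef>
#include <cstdio>

// Illustrative stand-in (hypothetical type, not the real ggml allocator):
// alloc() takes an ELEMENT count and scales by sizeof(T) internally.
template <typename T>
struct pool_alloc_t {
    std::size_t nbytes = 0;
    void alloc(std::size_t n_elements) { nbytes = n_elements * sizeof(T); }
};

struct float2_t { float x, y; }; // stand-in for CUDA's float2

int main() {
    pool_alloc_t<float2_t> dst_tmp_meta;
    const std::size_t n = 1000; // pretend: blocks_num.x * ncols * (2 + DV/2)

    dst_tmp_meta.alloc(n * sizeof(float)); // bug: bytes passed as a count
    std::printf("bytes-as-count: %zu bytes\n", dst_tmp_meta.nbytes);

    dst_tmp_meta.alloc(n);                 // fix: pass the element count
    std::printf("element count:  %zu bytes\n", dst_tmp_meta.nbytes);
    return 0;
}
```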

ggml/src/ggml-cuda/ggml-cuda.cu

@@ -3253,6 +3253,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid();
         }
     }

+    if (should_launch_concurrent_events) {
         // Restore original node order within each concurrent region to enable fusion within streams
@@ -3304,6 +3305,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
             }
         }
+    } else {
+        stream_ctx.concurrent_events.clear();
     }

     for (int i = 0; i < cgraph->n_nodes; i++) {
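
How the added branch fits together (editor's reading): either every recorded concurrent event is valid and the original node order is restored inside each region so per-stream fusion can kick in, or the cached events are dropped so later passes never consume stale stream state. A hedged sketch of this all-or-nothing guard, with hypothetical stand-in types:

```cpp
#include <algorithm>
#include <vector>

struct event_t {
    bool valid = false;
    int start_pos = 0;               // where the region begins in the node list
    std::vector<int> original_order; // node ids in pre-optimization order
};

struct stream_ctx_t { std::vector<event_t> concurrent_events; };

void restore_or_clear(stream_ctx_t & ctx, std::vector<int> & nodes) {
    bool all_valid = !ctx.concurrent_events.empty();
    for (const event_t & ev : ctx.concurrent_events) {
        all_valid = all_valid && ev.valid;
    }
    if (all_valid) {
        // restore the original node order within each concurrent region
        // (assumes nodes is large enough to hold every region)
        for (const event_t & ev : ctx.concurrent_events) {
            std::copy(ev.original_order.begin(), ev.original_order.end(),
                      nodes.begin() + ev.start_pos);
        }
    } else {
        // drop the events entirely so later passes don't see stale state
        ctx.concurrent_events.clear();
    }
}
```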
@@ -3692,11 +3695,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     }
 }

-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    ggml_cuda_set_device(cuda_ctx->device);
-
+static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
 #ifdef USE_CUDA_GRAPH
     static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
@@ -3706,7 +3705,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     }

     bool use_cuda_graph = true;
-    bool cuda_graph_update_required = false;

     if (cuda_ctx->cuda_graph->graph == nullptr) {
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
@@ -3727,6 +3725,29 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         use_cuda_graph = false;
     }

+    cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
+#else
+    bool use_cuda_graph = false;
+#endif // USE_CUDA_GRAPH
+    return use_cuda_graph;
+}
+
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
+
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+
+    // graph_optimize calls set_cuda_graph_enabled; in case it is not called (i.e. graph_compute is called directly),
+    // we call it here instead.
+#ifdef USE_CUDA_GRAPH
+    if (!cuda_ctx->cuda_graph) {
+        use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+    } else {
+        use_cuda_graph = cuda_ctx->cuda_graph && cuda_ctx->cuda_graph->cuda_graphs_enabled;
+    }
+
     if (use_cuda_graph) {
         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
@@ -3746,6 +3767,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 #endif
         }
     }
+#endif // USE_CUDA_GRAPH

     if (use_cuda_graph && cuda_graph_update_required) {
         // Start CUDA graph capture
@@ -3757,11 +3779,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
-#else
-    bool use_cuda_graph = false;
-    bool cuda_graph_update_required = false;
-#endif // USE_CUDA_GRAPH

     bool graph_evaluated_or_captured = false;
     evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
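
The net effect of the refactor above, as far as the hunks show: the graphs-enabled decision moves into `ggml_cuda_set_cuda_graph_enabled`, which caches its verdict in the new `cuda_graphs_enabled` flag, so `graph_optimize` and `graph_compute` act on one shared decision instead of recomputing it. A minimal sketch of the decide-once-and-cache pattern (names are simplified stand-ins, not the real API):

```cpp
struct graph_state_t { bool enabled = false; };
struct ctx_t { graph_state_t * graph = nullptr; };

// Decide once (env vars, device capability, ...) and cache the verdict.
static bool set_graphs_enabled(ctx_t & ctx) {
    const bool enabled = true; // placeholder for the real checks
    if (ctx.graph) {
        ctx.graph->enabled = enabled; // cache for later calls
    }
    return enabled;
}

static void compute(ctx_t & ctx) {
    // Reuse the cached decision when an earlier pass (e.g. an optimize
    // hook) already made it; otherwise decide here.
    const bool enabled = ctx.graph ? ctx.graph->enabled
                                   : set_graphs_enabled(ctx);
    (void) enabled; // ... launch with or without graph capture ...
}
```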
@@ -3797,8 +3814,10 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;

+    const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+
     static bool enable_graph_optimization = [] {
         const char * env = getenv("GGML_CUDA_GRAPH_OPT");
         return env != nullptr && atoi(env) == 1;
     }();
@@ -3806,12 +3825,13 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         return;
     }

-    GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
     GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes);

     ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
     stream_context.reset();

+    if (!use_cuda_graph || ggml_backend_cuda_get_device_count() != 1) {
+        return;
+    }
+
     // number of out-degrees for a particular node
     std::unordered_map<const ggml_tensor *, int> fan_out;
     // reverse mapping of node to index in the cgraph
@@ -3872,6 +3892,12 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         if (count >= min_fan_out && count <= max_fan_out) {
             const int root_node_idx = node_indices[root_node];

+            // only optimize for attn_norm
+            // TODO: make this more generic
+            if (!strstr(root_node->name, "attn_norm")) {
+                continue;
+            }
+
             bool is_part_of_event = false;
             for (const auto & [start, end] : concurrent_node_ranges) {
                 if (root_node_idx >= start && root_node_idx <= end) {
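
For context on the fan-out filter (an illustrative sketch, not the backend's actual code; the bounds below are invented): the optimizer counts how many nodes consume each tensor, then treats a node as the root of a potential concurrent region only if its consumer count lands in a window and its name contains "attn_norm":

```cpp
#include <cstring>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical node type standing in for ggml_tensor.
struct node_t {
    std::string name;
    std::vector<const node_t *> srcs; // inputs of this node
};

std::vector<const node_t *> find_roots(const std::vector<node_t> & graph,
                                       int min_fan_out, int max_fan_out) {
    // Count out-degrees: fan_out[t] = number of nodes that read t.
    std::unordered_map<const node_t *, int> fan_out;
    for (const node_t & n : graph) {
        for (const node_t * src : n.srcs) {
            fan_out[src]++;
        }
    }
    std::vector<const node_t *> roots;
    for (const node_t & n : graph) {
        const int count = fan_out[&n];
        // enough consumers to be worth splitting into streams, but not so
        // many that the region becomes unmanageable; restricted to
        // attention norms for now, mirroring the TODO above
        if (count >= min_fan_out && count <= max_fan_out &&
            std::strstr(n.name.c_str(), "attn_norm") != nullptr) {
            roots.push_back(&n);
        }
    }
    return roots;
}
```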

src/llama-context.cpp

@@ -1458,7 +1458,7 @@ ggml_cgraph * llama_context::graph_reserve(
     if (n_tokens % n_seqs != 0) {
         n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
-        n_outputs = std::min(n_outputs, n_tokens);
+        n_outputs = std::max(n_outputs, n_tokens);
         LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
     }
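
A quick worked example of the padding arithmetic, with illustrative numbers: for n_seqs = 3 and n_tokens = n_outputs = 8, the rounding yields ((8 + 2) / 3) * 3 = 9; `std::min` would have capped n_outputs at 8, while `std::max` lifts it to 9 so the reservation covers the worst case of one output per padded token:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    unsigned n_seqs = 3, n_tokens = 8, n_outputs = 8;

    if (n_tokens % n_seqs != 0) {
        n_tokens  = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // 8 -> 9
        n_outputs = std::max(n_outputs, n_tokens);                 // 8 -> 9 (min would keep 8)
    }
    std::printf("n_tokens = %u, n_outputs = %u\n", n_tokens, n_outputs);
    return 0;
}
```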