Compare commits

...

7 Commits
b7111 ... b7118

Author SHA1 Message Date
Piotr Wilkin (ilintar)
92c0b387a9 grammar : fix integer overflow (#17381)
* Fix DoS / integer overflow

* Remove optional, use INT64_MAX instead as placeholder value (it's technically -1, so it fits :)

* White space

* Actually, since it's unsigned, use UINT64_MAX
2025-11-20 14:47:04 +02:00
Georgi Gerganov
2286a360ff sync : ggml 2025-11-20 14:10:44 +02:00
YangLe
1d321e592b metal : fix compile on macos 11 (whisper/3533) 2025-11-20 14:10:44 +02:00
Georgi Gerganov
196f5083ef common : more accurate sampling timing (#17382)
* common : more accurate sampling timing

* eval-callback : minor fixes

* cont : add time_meas impl

* cont : fix log msg [no ci]

* cont : fix multiple definitions of time_meas

* llama-cli : exclude chat template init from time measurement

* cont : print percentage of unaccounted time

* cont : do not reset timings
2025-11-20 13:40:10 +02:00
o7si
5088b435d4 convert : fix TypeError when loading base model remotely in convert_lora_to_gguf (#17385)
* fix: TypeError when loading base model remotely in convert_lora_to_gguf

* refactor: simplify base model loading using cache_dir from HuggingFace

* Update convert_lora_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* feat: add remote_hf_model_id to trigger lazy mode in LoRA converter

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-11-20 12:30:12 +01:00
Piotr Wilkin (ilintar)
845f200b28 ggml : Fix transposed SOLVE_TRI result (#17323)
* Did someone transpose the SOLVE_TRI result matrix? Perhaps...

* Update ggml/src/ggml-cpu/ops.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update ggml/src/ggml-cpu/ops.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-11-20 12:58:21 +02:00
Scott Fudally
a7784a8b1d DGX Spark: UMA support (#17368)
* DGX Spark: UMA support

* Updates from PR feedback

* More PR feedback cleanup

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Remove trailing whitespace

* Update ggml/src/ggml-cuda/ggml-cuda.cu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-11-20 12:32:02 +02:00
13 changed files with 231 additions and 49 deletions

View File

@@ -26,7 +26,6 @@
#include <sstream>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>
@@ -60,6 +59,14 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
common_time_meas::~common_time_meas() {
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
}
//
// CPU utils
//

View File

@@ -2,17 +2,15 @@
#pragma once
#include "ggml-opt.h"
#include "llama-cpp.h"
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>
#include <cmath>
#include "ggml-opt.h"
#include "llama-cpp.h"
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
@@ -30,6 +28,15 @@
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
struct common_time_meas {
common_time_meas(int64_t & t_acc, bool disable = false);
~common_time_meas();
const int64_t t_start_us;
int64_t & t_acc;
};
struct common_adapter_lora_info {
std::string path;
float scale;

View File

@@ -3,9 +3,10 @@
#include "common.h"
#include "log.h"
#include <cmath>
#include <unordered_map>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <unordered_map>
// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
@@ -112,6 +113,13 @@ struct common_sampler {
llama_token_data_array cur_p;
void reset() {
prev.clear();
llama_sampler_reset(grmr);
llama_sampler_reset(chain);
}
void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
@@ -128,6 +136,12 @@ struct common_sampler {
cur_p = { cur.data(), cur.size(), -1, false };
}
common_time_meas tm() {
return common_time_meas(t_total_us, params.no_perf);
}
mutable int64_t t_total_us = 0;
};
std::string common_params_sampling::print() const {
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
}
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
const auto tm = gsmpl->tm();
if (accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
}
void common_sampler_reset(struct common_sampler * gsmpl) {
llama_sampler_reset(gsmpl->grmr);
llama_sampler_reset(gsmpl->chain);
gsmpl->reset();
}
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
// TODO: measure grammar performance
const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
llama_perf_sampler_data data_smpl;
llama_perf_context_data data_ctx;
memset(&data_smpl, 0, sizeof(data_smpl));
memset(&data_ctx, 0, sizeof(data_ctx));
if (gsmpl) {
llama_perf_sampler_print(gsmpl->chain);
auto & data = data_smpl;
data = llama_perf_sampler(gsmpl->chain);
// note: the sampling time includes the samplers time + extra time spent in common/sampling
LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
}
if (ctx) {
llama_perf_context_print(ctx);
auto & data = data_ctx;
data = llama_perf_context(ctx);
const double t_end_ms = 1e-3 * ggml_time_us();
const double t_total_ms = t_end_ms - data.t_start_ms;
const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
llama_memory_breakdown_print(ctx);
}
}
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
llama_synchronize(ctx);
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
const auto tm = gsmpl->tm();
gsmpl->set_logits(ctx, idx);
auto & grmr = gsmpl->grmr;
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
// helpers
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
const auto tm = gsmpl->tm();
auto * res = &gsmpl->cur_p;
if (do_sort && !res->sorted) {

View File

@@ -277,10 +277,15 @@ def parse_args() -> argparse.Namespace:
return parser.parse_args()
def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
from huggingface_hub import try_to_load_from_cache
# normally, adapter does not come with base model config, we need to load it from AutoConfig
config = AutoConfig.from_pretrained(hf_model_id)
return config.to_dict()
cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None
return config.to_dict(), cache_dir
if __name__ == '__main__':
@@ -325,13 +330,13 @@ if __name__ == '__main__':
# load base model
if base_model_id is not None:
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
hparams = load_hparams_from_hf(base_model_id)
hparams, dir_base_model = load_hparams_from_hf(base_model_id)
elif dir_base_model is None:
if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}")
try:
hparams = load_hparams_from_hf(model_id)
hparams, dir_base_model = load_hparams_from_hf(model_id)
except OSError as e:
logger.error(f"Failed to load base model config: {e}")
logger.error("Please try downloading the base model and add its path to --base")
@@ -480,6 +485,7 @@ if __name__ == '__main__':
dir_lora_model=dir_lora,
lora_alpha=alpha,
hparams=hparams,
remote_hf_model_id=base_model_id,
)
logger.info("Exporting model...")

View File

@@ -4,10 +4,10 @@
#include "llama.h"
#include "ggml.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
#include <numeric>
/**
* This the arbitrary data which will be passed to each callback.
@@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
return u.f;
}
static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
v = *(const float *) &data[i];
} else if (type == GGML_TYPE_I64) {
v = (float) *(int64_t *) &data[i];
v = (float) *(const int64_t *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
v = (float) *(const int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) &data[i];
v = (float) *(const int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
v = (float) *(const int8_t *) &data[i];
} else if (type == GGML_TYPE_BF16) {
v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
} else {
GGML_ABORT("fatal error");
}

View File

@@ -9696,13 +9696,12 @@ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params
for (int64_t i00 = 0; i00 < n; ++i00) {
float sum = 0.0f;
for (int64_t t = 0; t < i00; ++t) {
sum += A_batch[i00 * n + t] * X_batch[i01 * n + t];
sum += A_batch[i00 * n + t] * X_batch[t * k + i01];
}
const float diag = A_batch[i00 * n + i00];
GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix");
X_batch[i01 * n + i00] = (B_batch[i00 * k + i01] - sum) / diag;
X_batch[i00 * k + i01] = (B_batch[i00 * k + i01] - sum) / diag;
}
}
}

View File

@@ -3748,10 +3748,110 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
#if defined(__linux__)
// Helper function to get available memory from /proc/meminfo for UMA systems
static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_kb, long * free_swap_kb) {
FILE * meminfo_file = nullptr;
// 2KB buffer for reading /proc/meminfo since it does not report size info, should be enough
const size_t BUFFER_SIZE = 2048;
auto file_buffer = std::make_unique<char[]>(BUFFER_SIZE);
size_t bytes_read = 0;
long huge_tlb_total_pages = -1;
long huge_tlb_free_pages = -1;
long huge_tlb_page_size = -1;
if (available_memory_kb == nullptr || free_swap_kb == nullptr) {
return false;
}
meminfo_file = fopen("/proc/meminfo", "r");
if (meminfo_file == nullptr) {
GGML_LOG_ERROR("%s: failed to open /proc/meminfo\n", __func__);
return false;
}
// Read file into buffer
bytes_read = fread(file_buffer.get(), 1, BUFFER_SIZE - 1, meminfo_file);
fclose(meminfo_file);
if (bytes_read == 0) {
GGML_LOG_ERROR("%s: failed to read from /proc/meminfo\n", __func__);
return false;
}
file_buffer[bytes_read] = '\0';
*available_memory_kb = -1;
*free_swap_kb = -1;
// Parse the file buffer line by line
char * line = file_buffer.get();
char * line_next;
while (line < file_buffer.get() + bytes_read) {
// Find the end of the current line
line_next = strchr(line, '\n');
if (line_next != nullptr) {
*line_next = '\0';
line_next++;
} else {
line_next = file_buffer.get() + bytes_read;
}
long value;
if (sscanf(line, "MemAvailable: %ld kB", &value) == 1) {
*available_memory_kb = value;
} else if (sscanf(line, "SwapFree: %ld kB", &value) == 1) {
*free_swap_kb = value;
} else if (sscanf(line, "HugePages_Total: %ld", &value) == 1) {
huge_tlb_total_pages = value;
} else if (sscanf(line, "HugePages_Free: %ld", &value) == 1) {
huge_tlb_free_pages = value;
} else if (sscanf(line, "Hugepagesize: %ld kB", &value) == 1) {
huge_tlb_page_size = value;
}
line = line_next;
}
if (huge_tlb_total_pages != 0 && huge_tlb_total_pages != -1) {
*available_memory_kb = huge_tlb_free_pages * huge_tlb_page_size;
// Hugetlbfs pages are not swappable.
*free_swap_kb = 0;
}
GGML_LOG_DEBUG("%s: final available_memory_kb: %ld\n", __func__, *available_memory_kb);
return true;
}
#endif // defined(__linux__)
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemGetInfo(free, total));
// ref: https://github.com/ggml-org/llama.cpp/pull/17368
#if defined(__linux__)
// Check if this is a UMA (Unified Memory Architecture) system
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
// Check if UMA is explicitly enabled via environment variable
bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr;
bool is_uma = prop.unifiedAddressing > 0 || uma_env;
if (is_uma) {
// For UMA systems (like DGX Spark), use system memory info
long available_memory_kb = 0;
long free_swap_kb = 0;
if (ggml_backend_cuda_get_available_uma_memory(&available_memory_kb, &free_swap_kb) && available_memory_kb > 0) {
*free = (size_t)available_memory_kb * 1024;
} else {
GGML_LOG_ERROR("%s: /proc/meminfo reading failed, using cudaMemGetInfo\n", __func__);
}
}
#endif // defined(__linux__)
}
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {

View File

@@ -11,6 +11,7 @@
#include <cassert>
#include <algorithm>
#include <limits>
#include <cmath>
static ggml_metal_buffer_id ggml_metal_get_buffer_id(const ggml_tensor * t) {
if (!t) {

View File

@@ -1 +1 @@
7b6abb2b92fcef35cb01c6ce6ada9bd85306522d
781baf2a14d9e0aaee542b2e1bb918bfc4132199

View File

@@ -6,8 +6,10 @@
#include <cmath>
#include <algorithm>
#include <cstdint>
#include <stdexcept>
#define MAX_REPETITION_THRESHOLD 2000
//
// helpers
//
@@ -345,7 +347,9 @@ const char * llama_grammar_parser::parse_sequence(
size_t last_sym_start = rule.size();
const char * pos = src;
auto handle_repetitions = [&](int min_times, int max_times) {
// use UINT64_MAX as the empty value because we aligned to the proper unsigned long type so -1 can't be used
// (though it's technically the same as -1 now)
auto handle_repetitions = [&](unsigned long min_times, unsigned long max_times) {
if (last_sym_start == rule.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
@@ -373,20 +377,20 @@ const char * llama_grammar_parser::parse_sequence(
rule.resize(last_sym_start);
} else {
// Repeat the previous elements (min_times - 1) times
for (int i = 1; i < min_times; i++) {
for (unsigned long i = 1; i < min_times; i++) {
rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
}
}
uint32_t last_rec_rule_id = 0;
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
auto n_opt = max_times == UINT64_MAX ? 1 : max_times - min_times;
llama_grammar_rule rec_rule(prev_rule);
for (int i = 0; i < n_opt; i++) {
for (unsigned long i = 0; i < n_opt; i++) {
rec_rule.resize(prev_rule.size());
uint32_t rec_rule_id = generate_symbol_id( rule_name);
if (i > 0 || max_times < 0) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
if (i > 0 || max_times == UINT64_MAX) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times == UINT64_MAX ? rec_rule_id : last_rec_rule_id});
}
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
@@ -478,10 +482,10 @@ const char * llama_grammar_parser::parse_sequence(
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
int min_times = std::stoul(std::string(pos, int_end - pos));
unsigned long min_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
int max_times = -1;
unsigned long max_times = UINT64_MAX;
if (*pos == '}') {
max_times = min_times;
@@ -502,6 +506,9 @@ const char * llama_grammar_parser::parse_sequence(
} else {
throw std::runtime_error(std::string("expecting ',' at ") + pos);
}
if (min_times > MAX_REPETITION_THRESHOLD || (max_times != UINT64_MAX && max_times > MAX_REPETITION_THRESHOLD)) {
throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
}
handle_repetitions(min_times, max_times);
} else {
break;

View File

@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
time_meas::~time_meas() {
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
}
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
ggml_log_set(log_callback, user_data);

View File

@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
for (auto * smpl : chain->samplers) {
llama_sampler_reset(smpl);
}
chain->t_sample_us = 0;
chain->n_sample = 0;
}
static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c
void llama_perf_sampler_print(const struct llama_sampler * chain) {
const auto data = llama_perf_sampler(chain);
LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
}
void llama_perf_sampler_reset(struct llama_sampler * chain) {
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
auto * ctx = (struct llama_sampler_chain *) chain->ctx;
ctx->t_sample_us = ctx->n_sample = 0;
ctx->t_sample_us = 0;
ctx->n_sample = 0;
}

View File

@@ -147,11 +147,15 @@ int main(int argc, char ** argv) {
return 1;
}
auto * mem = llama_get_memory(ctx);
llama_memory_t mem = llama_get_memory(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
// note: the time for chat template initialization is not negligible:
auto chat_templates = common_chat_templates_init(model, params.chat_template);
// start measuring performance timings from here
llama_perf_context_reset(ctx);
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);