mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-02 23:24:06 +00:00
Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
be79d9fdd9 | ||
|
|
f432d8d83e | ||
|
|
4067f07fc5 | ||
|
|
4b8560ab56 | ||
|
|
0dd58b6877 | ||
|
|
69ffd89163 | ||
|
|
246c0d9c79 | ||
|
|
3edd87cd05 | ||
|
|
c0b45097c3 | ||
|
|
38dbdf4c05 | ||
|
|
368560a1e3 | ||
|
|
4ca088b036 |
484
common/arg.cpp
484
common/arg.cpp
@@ -57,12 +57,32 @@ static std::string read_file(const std::string & fname) {
|
||||
}
|
||||
|
||||
static void write_file(const std::string & fname, const std::string & content) {
|
||||
std::ofstream file(fname);
|
||||
const std::string fname_tmp = fname + ".tmp";
|
||||
std::ofstream file(fname_tmp);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
|
||||
}
|
||||
file << content;
|
||||
file.close();
|
||||
|
||||
try {
|
||||
file << content;
|
||||
file.close();
|
||||
|
||||
// Makes write atomic
|
||||
if (rename(fname_tmp.c_str(), fname.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, fname_tmp.c_str(), fname.c_str());
|
||||
// If rename fails, try to delete the temporary file
|
||||
if (remove(fname_tmp.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete temporary file: %s\n", __func__, fname_tmp.c_str());
|
||||
}
|
||||
}
|
||||
} catch (...) {
|
||||
// If anything fails, try to delete the temporary file
|
||||
if (remove(fname_tmp.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete temporary file: %s\n", __func__, fname_tmp.c_str());
|
||||
}
|
||||
|
||||
throw std::runtime_error(string_format("error: failed to write file '%s'\n", fname.c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
||||
@@ -217,250 +237,294 @@ struct curl_slist_ptr {
|
||||
}
|
||||
};
|
||||
|
||||
#define CURL_MAX_RETRY 3
|
||||
#define CURL_RETRY_DELAY_SECONDS 2
|
||||
|
||||
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
|
||||
int remaining_attempts = max_attempts;
|
||||
|
||||
while (remaining_attempts > 0) {
|
||||
LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res == CURLE_OK) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
||||
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
||||
|
||||
remaining_attempts--;
|
||||
if (remaining_attempts == 0) break;
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||
static CURLcode common_curl_perf(CURL * curl) {
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res != CURLE_OK) {
|
||||
LOG_ERR("%s: curl_easy_perform() failed\n", __func__);
|
||||
}
|
||||
|
||||
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
||||
|
||||
return false;
|
||||
return res;
|
||||
}
|
||||
|
||||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
|
||||
// Check if the file already exists locally
|
||||
auto file_exists = std::filesystem::exists(path);
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
|
||||
// Send a HEAD request to retrieve the etag and last-modified headers
|
||||
struct common_load_model_from_url_headers {
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
std::string accept_ranges;
|
||||
};
|
||||
|
||||
if (file_exists) {
|
||||
if (offline) {
|
||||
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
|
||||
return true; // skip verification/downloading
|
||||
// Deleter functor for std::unique_ptr<FILE, FILE_deleter>: closes the stream with fclose()
// when the owning pointer is reset or goes out of scope.
struct FILE_deleter {
|
||||
void operator()(FILE * f) const { fclose(f); }
|
||||
};
|
||||
|
||||
static size_t common_header_callback(char * buffer, size_t, size_t n_items, void * userdata) {
|
||||
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
||||
static std::regex header_regex("([^:]+): (.*)\r\n");
|
||||
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
||||
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
||||
static std::regex accept_ranges_regex("Accept-Ranges", std::regex_constants::icase);
|
||||
std::string header(buffer, n_items);
|
||||
std::smatch match;
|
||||
if (std::regex_match(header, match, header_regex)) {
|
||||
const std::string & key = match[1];
|
||||
const std::string & value = match[2];
|
||||
if (std::regex_match(key, match, etag_regex)) {
|
||||
headers->etag = value;
|
||||
} else if (std::regex_match(key, match, last_modified_regex)) {
|
||||
headers->last_modified = value;
|
||||
} else if (std::regex_match(key, match, accept_ranges_regex)) {
|
||||
headers->accept_ranges = value;
|
||||
}
|
||||
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
||||
std::ifstream metadata_in(metadata_path);
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
}
|
||||
}
|
||||
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
|
||||
} else {
|
||||
if (offline) {
|
||||
LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
// Send a HEAD request to retrieve the etag and last-modified headers
|
||||
struct common_load_model_from_url_headers {
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
};
|
||||
return n_items;
|
||||
}
|
||||
|
||||
common_load_model_from_url_headers headers;
|
||||
bool head_request_ok = false;
|
||||
bool should_download = !file_exists; // by default, we should download if the file does not exist
|
||||
// Write callback installed through CURLOPT_WRITEFUNCTION (see common_curl_easy_setopt_get).
// `fd` is the pointer registered with CURLOPT_WRITEDATA and is treated as a FILE *.
// The received chunk (`nmemb` items of `size` bytes at `data`) is forwarded to std::fwrite and
// fwrite's return value (number of items written) is handed back to the caller unchanged.
static size_t common_write_callback(void * data, size_t size, size_t nmemb, void * fd) {
|
||||
return std::fwrite(data, size, nmemb, static_cast<FILE *>(fd));
|
||||
}
|
||||
|
||||
// Initialize libcurl
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
// Helper that masks credentials embedded in a URL so the URL can be logged safely.
//
// A URL of the form `scheme://[userinfo@]rest` is rewritten to `scheme://********@rest`
// when a userinfo component (`user` or `user:password`) is present.
//
// @param url  URL to sanitize
// @return     the URL with its userinfo replaced by "********", or `url` unchanged when it
//             carries no credentials or does not look like `scheme://...`
static std::string llama_download_hide_password_in_url(const std::string & url) {
    // match[1] = scheme including "://"   (e.g. "https://")
    // match[2] = userinfo including '@'   (optional, e.g. "user:password@")
    // match[3] = rest of the URL          (host, path, query, ...)
    //
    // The userinfo group is greedy and cannot cross '/', '?' or '#', so it ends at the LAST '@'
    // of the authority: a password containing a raw '@' is fully masked, while an '@' that only
    // appears in the path or query is never mistaken for credentials.
    static const std::regex url_regex(R"(^([A-Za-z][A-Za-z0-9+.-]*://)([^/?#]*@)?(.*)$)");

    std::smatch match;
    if (std::regex_match(url, match, url_regex) && match[2].matched) {
        return match[1].str() + "********@" + match[3].str();
    }

    return url;  // No credentials found or malformed URL
}
|
||||
|
||||
// Configures `curl` for a header-only (HEAD) request to `url`:
// sets the target URL, enables redirect following, disables the response body and the
// progress meter, and routes response headers to common_header_callback.
// On Windows builds it additionally selects the operating system's native CA store.
// This function only sets options; it does not perform the request.
static void common_curl_easy_setopt_head(CURL * curl, const std::string & url) {
|
||||
// Set the URL, allow to follow http redirection
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||
|
||||
# if defined(_WIN32)
|
||||
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
||||
// operating system. Currently implemented under MS-Windows.
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
# endif
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
||||
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
||||
curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, common_header_callback);
|
||||
}
|
||||
|
||||
// Switches `curl` from the header-only configuration to a body download:
// clears CURLOPT_NOBODY, installs common_write_callback as the body sink and turns the
// progress meter back on. The destination (CURLOPT_WRITEDATA) is not set here; the caller
// is expected to provide it.
static void common_curl_easy_setopt_get(CURL * curl) {
|
||||
curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, common_write_callback);
|
||||
|
||||
// display download progress
|
||||
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
|
||||
}
|
||||
|
||||
// Downloads the body of the request configured on `curl` into the file `path_temporary`.
//
// If `path_temporary` already exists it is treated as a partial download: its current size
// is used as the start of an open-ended byte range ("<size>-") and new data is appended.
//
// @param curl            handle whose URL/headers have already been configured by the caller
// @param path_temporary  path of the in-progress download file
// @return true when the file could be opened and common_curl_perf() reported CURLE_OK,
//         false otherwise (the HTTP status code is not inspected here)
static bool common_pull_file(CURL * curl, const std::string & path_temporary) {
|
||||
// NOTE(review): the log line below states that the server supports range requests, but this
// function only checks that the temporary file exists — confirm the caller has verified
// range support before reaching this point.
if (std::filesystem::exists(path_temporary)) {
|
||||
const std::string partial_size = std::to_string(std::filesystem::file_size(path_temporary));
|
||||
LOG_INF("%s: server supports range requests, resuming download from byte %s\n", __func__, partial_size.c_str());
|
||||
const std::string range_str = partial_size + "-";
|
||||
curl_easy_setopt(curl, CURLOPT_RANGE, range_str.c_str());
|
||||
}
|
||||
|
||||
// Always open the file in append mode, since we could be resuming a partial download
|
||||
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "ab"));
|
||||
if (!outfile) {
|
||||
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_temporary.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
common_curl_easy_setopt_get(curl);
|
||||
// the FILE * is what common_write_callback receives as its last argument
curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile.get());
|
||||
|
||||
return common_curl_perf(curl) == CURLE_OK;
|
||||
}
|
||||
|
||||
static bool common_download_head(CURL * curl,
|
||||
curl_slist_ptr & http_headers,
|
||||
const std::string & url,
|
||||
const std::string & bearer_token) {
|
||||
if (!curl) {
|
||||
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Set the URL, allow to follow http redirection
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
// Check if hf-token or bearer-token was specified
|
||||
if (!bearer_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
}
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
#if defined(_WIN32)
|
||||
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
||||
// operating system. Currently implemented under MS-Windows.
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
|
||||
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
||||
|
||||
static std::regex header_regex("([^:]+): (.*)\r\n");
|
||||
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
||||
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
||||
|
||||
std::string header(buffer, n_items);
|
||||
std::smatch match;
|
||||
if (std::regex_match(header, match, header_regex)) {
|
||||
const std::string & key = match[1];
|
||||
const std::string & value = match[2];
|
||||
if (std::regex_match(key, match, etag_regex)) {
|
||||
headers->etag = value;
|
||||
} else if (std::regex_match(key, match, last_modified_regex)) {
|
||||
headers->last_modified = value;
|
||||
}
|
||||
}
|
||||
return n_items;
|
||||
};
|
||||
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
|
||||
// we only allow retrying once for HEAD requests
|
||||
// this is for the use case of using running offline (no internet), retrying can be annoying
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
|
||||
if (!was_perform_successful) {
|
||||
head_request_ok = false;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code == 200) {
|
||||
head_request_ok = true;
|
||||
} else {
|
||||
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
head_request_ok = false;
|
||||
}
|
||||
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
common_curl_easy_setopt_head(curl, url);
|
||||
return common_curl_perf(curl) == CURLE_OK;
|
||||
}
|
||||
|
||||
// if head_request_ok is false, we don't have the etag or last-modified headers
|
||||
// we leave should_download as-is, which is true if the file does not exist
|
||||
if (head_request_ok) {
|
||||
// check if ETag or Last-Modified headers are different
|
||||
// if it is, we need to download the file again
|
||||
if (!etag.empty() && etag != headers.etag) {
|
||||
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
||||
should_download = true;
|
||||
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
||||
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
||||
should_download = true;
|
||||
}
|
||||
}
|
||||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
bool offline) {
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
static const int max_attempts = 3;
|
||||
static const int retry_delay_seconds = 2;
|
||||
for (int i = 0; i < max_attempts; ++i) {
|
||||
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
|
||||
if (should_download) {
|
||||
std::string path_temporary = path + ".downloadInProgress";
|
||||
// Check if the file already exists locally
|
||||
const auto file_exists = std::filesystem::exists(path);
|
||||
if (file_exists) {
|
||||
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
if (offline) {
|
||||
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
|
||||
return true; // skip verification/downloading
|
||||
}
|
||||
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
||||
std::ifstream metadata_in(metadata_path);
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
|
||||
metadata.dump().c_str());
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
}
|
||||
}
|
||||
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
|
||||
} else {
|
||||
if (offline) {
|
||||
LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
// Set the output file
|
||||
bool head_request_ok = false;
|
||||
bool should_download = !file_exists; // by default, we should download if the file does not exist
|
||||
|
||||
struct FILE_deleter {
|
||||
void operator()(FILE * f) const {
|
||||
fclose(f);
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
||||
if (!outfile) {
|
||||
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
|
||||
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
|
||||
return fwrite(data, size, nmemb, (FILE *)fd);
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
|
||||
|
||||
// display download progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
|
||||
|
||||
// helper function to hide password in URL
|
||||
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
||||
std::size_t protocol_pos = url.find("://");
|
||||
if (protocol_pos == std::string::npos) {
|
||||
return url; // Malformed URL
|
||||
}
|
||||
|
||||
std::size_t at_pos = url.find('@', protocol_pos + 3);
|
||||
if (at_pos == std::string::npos) {
|
||||
return url; // No password in URL
|
||||
}
|
||||
|
||||
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
|
||||
};
|
||||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET");
|
||||
// Initialize libcurl
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
common_load_model_from_url_headers headers;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
curl_slist_ptr http_headers;
|
||||
const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
head_request_ok = false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code < 200 || http_code >= 400) {
|
||||
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
||||
return false;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code == 200) {
|
||||
head_request_ok = true;
|
||||
} else {
|
||||
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
head_request_ok = false;
|
||||
}
|
||||
|
||||
// Causes file to be closed explicitly here before we rename it.
|
||||
outfile.reset();
|
||||
|
||||
// Write the updated JSON metadata file.
|
||||
metadata.update({
|
||||
{"url", url},
|
||||
{"etag", headers.etag},
|
||||
{"lastModified", headers.last_modified}
|
||||
});
|
||||
write_file(metadata_path, metadata.dump(4));
|
||||
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
|
||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
// if head_request_ok is false, we don't have the etag or last-modified headers
|
||||
// we leave should_download as-is, which is true if the file does not exist
|
||||
bool should_download_from_scratch = false;
|
||||
if (head_request_ok) {
|
||||
// check if ETag or Last-Modified headers are different
|
||||
// if it is, we need to download the file again
|
||||
if (!etag.empty() && etag != headers.etag) {
|
||||
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(),
|
||||
headers.etag.c_str());
|
||||
should_download = true;
|
||||
should_download_from_scratch = true;
|
||||
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
||||
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__,
|
||||
last_modified.c_str(), headers.last_modified.c_str());
|
||||
should_download = true;
|
||||
should_download_from_scratch = true;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
|
||||
|
||||
const bool accept_ranges_supported = !headers.accept_ranges.empty() && headers.accept_ranges != "none";
|
||||
if (should_download) {
|
||||
if (file_exists &&
|
||||
!accept_ranges_supported) { // Resumable downloads not supported, delete and start again.
|
||||
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
const std::string path_temporary = path + ".downloadInProgress";
|
||||
if (should_download_from_scratch) {
|
||||
if (std::filesystem::exists(path_temporary)) {
|
||||
if (remove(path_temporary.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (std::filesystem::exists(path)) {
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write the updated JSON metadata file.
|
||||
metadata.update({
|
||||
{ "url", url },
|
||||
{ "etag", headers.etag },
|
||||
{ "lastModified", headers.last_modified }
|
||||
});
|
||||
write_file(metadata_path, metadata.dump(4));
|
||||
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
|
||||
__func__, llama_download_hide_password_in_url(url).c_str(), path_temporary.c_str(),
|
||||
headers.etag.c_str(), headers.last_modified.c_str());
|
||||
const bool was_pull_successful = common_pull_file(curl.get(), path_temporary);
|
||||
if (!was_pull_successful) {
|
||||
if (i + 1 < max_attempts) {
|
||||
const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
|
||||
LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||
} else {
|
||||
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code < 200 || http_code >= 400) {
|
||||
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -770,7 +834,7 @@ static std::string common_docker_get_token(const std::string & repo) {
|
||||
}
|
||||
|
||||
static std::string common_docker_resolve_model(const std::string & docker) {
|
||||
// Parse ai/smollm2:135M-Q4_K_M
|
||||
// Parse ai/smollm2:135M-Q4_0
|
||||
size_t colon_pos = docker.find(':');
|
||||
std::string repo, tag;
|
||||
if (colon_pos != std::string::npos) {
|
||||
|
||||
@@ -1741,10 +1741,12 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
|
||||
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
LOG_DBG("%s\n", __func__);
|
||||
common_chat_params data;
|
||||
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
|
||||
const std::optional<json> tools_override = json();
|
||||
const std::optional<json> additional_context = json {
|
||||
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
|
||||
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
|
||||
});
|
||||
};
|
||||
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
|
||||
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
@@ -2230,15 +2232,28 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
|
||||
|
||||
static void common_chat_parse_granite(common_chat_msg_parser & builder) {
|
||||
// Parse thinking tags
|
||||
static const common_regex start_think_regex(regex_escape("<think>"));
|
||||
static const common_regex end_think_regex(regex_escape("</think>"));
|
||||
// Granite models output partial tokens such as "<" and "<think".
|
||||
// By leveraging try_consume_regex()/try_find_regex() throwing
|
||||
// common_chat_msg_partial_exception for these partial tokens,
|
||||
// processing is interrupted and the tokens are not passed to add_content().
|
||||
if (auto res = builder.try_consume_regex(start_think_regex)) {
|
||||
// Restore position for try_parse_reasoning()
|
||||
builder.move_to(res->groups[0].begin);
|
||||
builder.try_find_regex(end_think_regex, std::string::npos, false);
|
||||
// Restore position for try_parse_reasoning()
|
||||
builder.move_to(res->groups[0].begin);
|
||||
}
|
||||
builder.try_parse_reasoning("<think>", "</think>");
|
||||
|
||||
// Parse response tags using regex
|
||||
static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
|
||||
if (auto res = builder.try_find_regex(response_regex)) {
|
||||
// Extract the content between the tags (capture group 1)
|
||||
auto content = builder.str(res->groups[1]);
|
||||
builder.add_content(content);
|
||||
builder.move_to(res->groups[0].end);
|
||||
// Parse response tags
|
||||
static const common_regex start_response_regex(regex_escape("<response>"));
|
||||
static const common_regex end_response_regex(regex_escape("</response>"));
|
||||
// Granite models output partial tokens such as "<" and "<response".
|
||||
// Same hack as reasoning parsing.
|
||||
if (builder.try_consume_regex(start_response_regex)) {
|
||||
builder.try_find_regex(end_response_regex);
|
||||
}
|
||||
|
||||
if (!builder.syntax().parse_tool_calls) {
|
||||
@@ -2252,13 +2267,10 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
|
||||
builder.move_to(res->groups[0].end);
|
||||
|
||||
// Expect JSON array of tool calls
|
||||
auto tool_calls_data = builder.consume_json();
|
||||
if (tool_calls_data.json.is_array()) {
|
||||
if (!builder.add_tool_calls(tool_calls_data.json)) {
|
||||
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
|
||||
if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
|
||||
if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
|
||||
throw common_chat_msg_partial_exception("incomplete tool call");
|
||||
}
|
||||
} else {
|
||||
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
|
||||
}
|
||||
} else {
|
||||
builder.add_content(builder.consume_rest());
|
||||
|
||||
@@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
|
||||
|
||||
if (NOT MSVC)
|
||||
if (GGML_STATIC)
|
||||
if (UNIX AND NOT APPLE)
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
|
||||
endif()
|
||||
add_link_options(-static)
|
||||
if (MINGW)
|
||||
add_link_options(-static-libgcc -static-libstdc++)
|
||||
|
||||
@@ -116,7 +116,7 @@ extern "C" {
|
||||
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
||||
|
||||
// (optional) sort/optimize the nodes in the graph
|
||||
void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
};
|
||||
|
||||
struct ggml_backend {
|
||||
|
||||
@@ -463,10 +463,10 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
|
||||
backend->iface.event_wait(backend, event);
|
||||
}
|
||||
|
||||
static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
GGML_ASSERT(backend);
|
||||
if (backend->iface.optimize_graph != NULL) {
|
||||
backend->iface.optimize_graph(backend, cgraph);
|
||||
if (backend->iface.graph_optimize != NULL) {
|
||||
backend->iface.graph_optimize(backend, cgraph);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1307,7 +1307,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
||||
|
||||
// Optimize this split of the graph. This needs to happen before we make graph_copy,
|
||||
// so they are in sync.
|
||||
ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
|
||||
ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
|
||||
|
||||
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
||||
for (int j = 0; j < split->n_inputs; j++) {
|
||||
|
||||
@@ -270,7 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
|
||||
/* .graph_compute = */ ggml_backend_blas_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_blas_guid(void) {
|
||||
|
||||
@@ -2756,7 +2756,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
|
||||
/* .graph_compute = */ ggml_backend_cann_graph_compute,
|
||||
/* .event_record = */ ggml_backend_cann_event_record,
|
||||
/* .event_wait = */ ggml_backend_cann_event_wait,
|
||||
/* .optimize_graph = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
#include "ggml-cpu.h"
|
||||
#include "traits.h"
|
||||
|
||||
#if defined(__gnu_linux__)
|
||||
#if defined(__linux__)
|
||||
#include <sys/syscall.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
@@ -186,7 +186,7 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
|
||||
#define XFEATURE_XTILEDATA 18
|
||||
|
||||
static bool ggml_amx_init() {
|
||||
#if defined(__gnu_linux__)
|
||||
#if defined(__linux__)
|
||||
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
|
||||
fprintf(stderr, "AMX is not ready to be used!\n");
|
||||
return false;
|
||||
@@ -194,6 +194,8 @@ static bool ggml_amx_init() {
|
||||
return true;
|
||||
#elif defined(_WIN32)
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
|
||||
return GGML_BF16_TO_FP32(x);
|
||||
}
|
||||
|
||||
// Converts a 32-bit signed integer to float.
// The conversion is lossy for magnitudes above 2^24 (float has a 24-bit
// significand), so it is spelled out with an explicit cast instead of relying
// on an implicit conversion.
static inline float i32_to_f32(int32_t x) {
    return static_cast<float>(x);
}
|
||||
|
||||
// Converts a float to a 32-bit signed integer, discarding the fractional part
// (truncation toward zero). The narrowing conversion is made explicit.
// NOTE(review): as with any C++ float-to-integer conversion, the result is
// undefined when the truncated value does not fit in int32_t (including NaN
// and infinities) - callers are presumably expected to pass in-range values.
static inline int32_t f32_to_i32(float x) {
    return static_cast<int32_t>(x);
}
|
||||
|
||||
// Identity conversion: returns the float argument unchanged.
static inline float f32_to_f32(float x) {
|
||||
return x;
|
||||
}
|
||||
@@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
|
||||
static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
|
||||
};
|
||||
|
||||
// Specialization of type_conversion_table for int32_t.
// Exposes the int32 <-> f32 helpers as constexpr function pointers:
//   to_f32   -> i32_to_f32
//   from_f32 -> f32_to_i32
template <>
|
||||
struct type_conversion_table<int32_t> {
|
||||
static constexpr float (*to_f32)(int32_t) = i32_to_f32;
|
||||
static constexpr int32_t (*from_f32)(float) = f32_to_i32;
|
||||
};
|
||||
|
||||
static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
|
||||
const int64_t ith = params->ith;
|
||||
const int64_t nth = params->nth;
|
||||
|
||||
@@ -190,7 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -652,6 +652,14 @@ static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fa
|
||||
return n - fastdiv(n, fastdiv_values) * fastdiv_values.z;
|
||||
}
|
||||
|
||||
// Computes the quotient and the remainder of n by a precomputed divisor in a
// single call. Returns <n / divisor, n % divisor> packed as <x, y> of a uint2.
static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 fastdiv_values) {
    // expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
    const uint32_t quotient  = fastdiv(n, fastdiv_values);
    // The remainder is derived from the quotient: n - (n / divisor) * divisor.
    const uint32_t remainder = n - quotient * fastdiv_values.z;
    return make_uint2(quotient, remainder);
}
|
||||
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);
|
||||
|
||||
static __device__ __forceinline__ float get_alibi_slope(
|
||||
|
||||
@@ -35,7 +35,6 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int
|
||||
switch (D) {
|
||||
case 64:
|
||||
case 128:
|
||||
return 128;
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
default:
|
||||
@@ -86,7 +85,6 @@ static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols
|
||||
switch (D) {
|
||||
case 64:
|
||||
case 128:
|
||||
return 128;
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
default:
|
||||
|
||||
@@ -3140,7 +3140,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||
/* .optimize_graph = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cuda_guid() {
|
||||
|
||||
@@ -1,82 +1,89 @@
|
||||
#include "pad_reflect_1d.cuh"
|
||||
|
||||
static __global__ void pad_reflect_1d_kernel_f32(
|
||||
const void * __restrict__ src0,
|
||||
void * __restrict__ dst,
|
||||
const int64_t ne0,
|
||||
const int64_t ne00,
|
||||
const int64_t ne01,
|
||||
const int64_t ne02,
|
||||
const int64_t ne03,
|
||||
const int64_t nb00,
|
||||
const int64_t nb01,
|
||||
const int64_t nb02,
|
||||
const int64_t nb03,
|
||||
const int64_t nb0,
|
||||
const int64_t nb1,
|
||||
const int64_t nb2,
|
||||
const int64_t nb3,
|
||||
const int p0,
|
||||
const int p1) {
|
||||
|
||||
static __global__ __launch_bounds__(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1) void
|
||||
pad_reflect_1d_kernel_f32(
|
||||
const void * __restrict__ src0,
|
||||
void * __restrict__ dst,
|
||||
const int64_t ne0,
|
||||
const int64_t ne00,
|
||||
const uint3 ne01,
|
||||
const int64_t ne02,
|
||||
const int64_t ne03,
|
||||
const int64_t nb00,
|
||||
const int64_t nb01,
|
||||
const int64_t nb02,
|
||||
const int64_t nb03,
|
||||
const int64_t nb0,
|
||||
const int64_t nb1,
|
||||
const int64_t nb2,
|
||||
const int64_t nb3,
|
||||
const int p0,
|
||||
const int p1) {
|
||||
const int64_t i3 = blockIdx.z;
|
||||
const int64_t i2 = blockIdx.y;
|
||||
const int64_t i1 = blockIdx.x;
|
||||
|
||||
if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
|
||||
const uint2 div_mod_packed = fast_div_modulo(blockIdx.x, ne01);
|
||||
const int64_t tile1 = div_mod_packed.y; // i1
|
||||
const int64_t tile0 = div_mod_packed.x; // nth i0 tile
|
||||
const int64_t i1 = tile1;
|
||||
const int64_t i0 = threadIdx.x + tile0 * blockDim.x;
|
||||
|
||||
// ne01.z is original value of unpacked ne01 (see init_fastdiv_values in common.cuh)
|
||||
if (i0 >= ne0 || i1 >= ne01.z || i2 >= ne02 || i3 >= ne03) {
|
||||
return;
|
||||
}
|
||||
|
||||
const char * src0_ptr = (const char *)src0 + i3*nb03 + i2*nb02 + i1*nb01;
|
||||
char * dst_ptr = (char *)dst + i3*nb3 + i2*nb2 + i1*nb1;
|
||||
const char * src0_ptr = (const char *) src0 + i3 * nb03 + i2 * nb02 + i1 * nb01;
|
||||
char * dst_ptr = (char *) dst + i3 * nb3 + i2 * nb2 + i1 * nb1;
|
||||
|
||||
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
|
||||
float value;
|
||||
const int64_t rel_i0 = i0 - p0; // relative i0 in src0
|
||||
int64_t src_idx;
|
||||
|
||||
if (i0 < p0) {
|
||||
// Left padding - reflect
|
||||
value = *(const float *)(src0_ptr + (p0 - i0) * nb00);
|
||||
} else if (i0 < ne0 - p1) {
|
||||
// Middle - copy
|
||||
value = *(const float *)(src0_ptr + (i0 - p0) * nb00);
|
||||
} else {
|
||||
// Right padding - reflect
|
||||
int64_t src_idx = (ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1;
|
||||
value = *(const float *)(src0_ptr + src_idx * nb00);
|
||||
}
|
||||
|
||||
*(float *)(dst_ptr + i0 * nb0) = value;
|
||||
if (rel_i0 < 0) {
|
||||
// Left padding - reflect
|
||||
src_idx = -rel_i0;
|
||||
} else if (rel_i0 < ne00) {
|
||||
// Middle - copy
|
||||
src_idx = rel_i0;
|
||||
} else {
|
||||
// Right padding - reflect
|
||||
src_idx = 2 * ne00 - 2 - rel_i0;
|
||||
}
|
||||
const float value = *(const float *) (src0_ptr + src_idx * nb00);
|
||||
*(float *) (dst_ptr + i0 * nb0) = value;
|
||||
}
|
||||
|
||||
void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
cudaStream_t stream = ctx.stream();
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int32_t * opts = (const int32_t *) dst->op_params;
|
||||
const int p0 = opts[0];
|
||||
const int p1 = opts[1];
|
||||
const int p0 = opts[0];
|
||||
const int p1 = opts[1];
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const uint3 ne01_packed = init_fastdiv_values(ne01);
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
|
||||
// sanity: padded length matches
|
||||
GGML_ASSERT(ne0 == ne00 + p0 + p1);
|
||||
|
||||
const dim3 block_dims(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1, 1);
|
||||
const dim3 grid_dims(ne01, ne02, ne03);
|
||||
constexpr int64_t bx = CUDA_PAD_REFLECT_1D_BLOCK_SIZE; // threads per block (x)
|
||||
const int64_t tiles0 = (ne0 + bx - 1) / bx; // number of tiles along i0
|
||||
// grid.x covers i1 and all tiles of i0: [ne01 * tiles0]
|
||||
// grid.y covers i2: [ne02]
|
||||
// grid.z covers i3: [ne03]
|
||||
const dim3 grid_dims((unsigned) (ne01 * tiles0), (unsigned) ne02, (unsigned) ne03);
|
||||
const dim3 block_dims((unsigned) bx, 1, 1);
|
||||
|
||||
pad_reflect_1d_kernel_f32<<<grid_dims, block_dims, 0, stream>>>(
|
||||
src0->data, dst->data,
|
||||
ne0, ne00, ne01, ne02, ne03,
|
||||
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
|
||||
p0, p1
|
||||
);
|
||||
src0->data, dst->data, ne0, ne00, ne01_packed, ne02, ne03, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], p0, p1);
|
||||
}
|
||||
|
||||
@@ -447,7 +447,7 @@ static ggml_backend_i ggml_backend_metal_i = {
|
||||
// https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ ggml_backend_metal_graph_optimize,
|
||||
/* .graph_optimize = */ ggml_backend_metal_graph_optimize,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_metal_guid(void) {
|
||||
|
||||
@@ -83,8 +83,10 @@ set(GGML_OPENCL_KERNELS
|
||||
mul_mv_q4_0_f32_1d_16x_flat
|
||||
mul_mv_q6_k
|
||||
mul_mv_mxfp4_f32
|
||||
mul_mv_mxfp4_f32_flat
|
||||
mul_mv_id_q4_0_f32_8x_flat
|
||||
mul_mv_id_mxfp4_f32
|
||||
mul_mv_id_mxfp4_f32_flat
|
||||
mul_mm_f32_f32_l4_lm
|
||||
mul_mm_f16_f32_l4_lm
|
||||
mul
|
||||
|
||||
@@ -368,6 +368,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
|
||||
cl_program program_mul_mv_q6_K;
|
||||
cl_program program_mul_mv_mxfp4_f32;
|
||||
cl_program program_mul_mv_mxfp4_f32_flat;
|
||||
cl_program program_mul_mv_f16_f16;
|
||||
cl_program program_mul_mv_f16_f32_1row;
|
||||
cl_program program_mul_mv_f16_f32_l4;
|
||||
@@ -402,6 +403,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program_tsembd;
|
||||
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
|
||||
cl_program program_mul_mv_id_mxfp4_f32;
|
||||
cl_program program_mul_mv_id_mxfp4_f32_flat;
|
||||
cl_program program_mul_mm_f32_f32_l4_lm;
|
||||
cl_program program_mul_mm_f16_f32_l4_lm;
|
||||
|
||||
@@ -447,11 +449,12 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_mul_mat_f16_f32_tiled;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
|
||||
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
|
||||
cl_kernel kernel_convert_block_mxfp4, kernel_restore_block_mxfp4;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
|
||||
cl_kernel kernel_convert_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
||||
cl_kernel kernel_mul_mv_q6_K_f32;
|
||||
cl_kernel kernel_mul_mv_mxfp4_f32;
|
||||
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
|
||||
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
|
||||
cl_kernel kernel_argsort_f32_i32;
|
||||
cl_kernel kernel_sum_rows_f32;
|
||||
@@ -469,6 +472,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
cl_kernel kernel_mul_mv_id_mxfp4_f32;
|
||||
cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
|
||||
cl_kernel kernel_mul_mm_f32_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_f16_f32_l4_lm;
|
||||
|
||||
@@ -765,6 +769,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
@@ -1002,6 +1008,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_mxfp4_f32_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_mxfp4_f32_flat.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_mxfp4_f32_flat.cl");
|
||||
#endif
|
||||
backend_ctx->program_mul_mv_mxfp4_f32_flat =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32_flat, "kernel_mul_mv_mxfp4_f32_flat", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_f16_f16
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -1727,6 +1749,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_id_mxfp4_f32_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_id_mxfp4_f32_flat.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32_flat.cl");
|
||||
#endif
|
||||
backend_ctx->program_mul_mv_id_mxfp4_f32_flat =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32_flat, "kernel_mul_mv_id_mxfp4_f32_flat", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// Adreno kernels
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// transpose
|
||||
@@ -2391,6 +2429,51 @@ struct ggml_tensor_extra_cl_q4_0 {
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_tensor_extra_cl_mxfp4 {
|
||||
// Quantized values.
|
||||
cl_mem q = nullptr;
|
||||
// Quantized values in image1d_buffer_t.
|
||||
cl_mem q_img = nullptr;
|
||||
// Scales in E8M0.
|
||||
cl_mem e = nullptr;
|
||||
// Scales in image1d_buffer_t.
|
||||
cl_mem e_img = nullptr;
|
||||
// Size of quantized values.
|
||||
size_t size_q = 0;
|
||||
// Size of scales.
|
||||
size_t size_e = 0;
|
||||
|
||||
~ggml_tensor_extra_cl_mxfp4() {
|
||||
reset();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
// q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
|
||||
// They must be properly released so that the original buffer can be
|
||||
// properly released to avoid memory leak.
|
||||
if (q != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(q));
|
||||
q = nullptr;
|
||||
}
|
||||
if (e != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(e));
|
||||
e = nullptr;
|
||||
}
|
||||
if (q != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(q_img));
|
||||
q = nullptr;
|
||||
}
|
||||
// Currently, q_img and d_img are only initialized when SMALL_ALLOC is
|
||||
// enabled. They point to the images in ggml_backend_opencl_buffer_context.
|
||||
// So, there is no need to release them here.
|
||||
// TODO: initialize them for non SMALL_PATH path, or remove them.
|
||||
q_img = nullptr;
|
||||
e_img = nullptr;
|
||||
size_q = 0;
|
||||
size_e = 0;
|
||||
}
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Backend API
|
||||
//------------------------------------------------------------------------------
|
||||
@@ -2838,7 +2921,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
|
||||
/* .graph_compute = */ ggml_backend_opencl_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_t ggml_backend_opencl_init(void) {
|
||||
@@ -2894,6 +2977,12 @@ struct ggml_backend_opencl_buffer_context {
|
||||
for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
|
||||
delete e;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
|
||||
@@ -2926,6 +3015,21 @@ struct ggml_backend_opencl_buffer_context {
|
||||
return extra;
|
||||
}
|
||||
|
||||
// Hands out an MXFP4 extra: reuses one from the free pool
// (`temp_tensor_extras_mxfp4`) when available, otherwise allocates a new one.
// The returned extra is recorded in `temp_tensor_extras_mxfp4_in_use` and is
// reset before being returned.
ggml_tensor_extra_cl_mxfp4 * ggml_opencl_alloc_temp_tensor_extra_mxfp4() {
    ggml_tensor_extra_cl_mxfp4 * picked = nullptr;

    if (!temp_tensor_extras_mxfp4.empty()) {
        // Take the most recently returned extra from the pool.
        picked = temp_tensor_extras_mxfp4.back();
        temp_tensor_extras_mxfp4.pop_back();
    } else {
        picked = new ggml_tensor_extra_cl_mxfp4();
    }

    temp_tensor_extras_mxfp4_in_use.push_back(picked);

    // Drop whatever state a recycled extra still carries.
    picked->reset();
    return picked;
}
|
||||
|
||||
void reset() {
|
||||
for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
|
||||
temp_tensor_extras.push_back(e);
|
||||
@@ -2936,6 +3040,11 @@ struct ggml_backend_opencl_buffer_context {
|
||||
temp_tensor_extras_q4_0.push_back(e);
|
||||
}
|
||||
temp_tensor_extras_q4_0_in_use.clear();
|
||||
|
||||
for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
|
||||
temp_tensor_extras_mxfp4.push_back(e);
|
||||
}
|
||||
temp_tensor_extras_mxfp4_in_use.clear();
|
||||
}
|
||||
|
||||
// Pools for extras. Available extras are in `temp_tensor_extras`. Extras
|
||||
@@ -2947,6 +3056,8 @@ struct ggml_backend_opencl_buffer_context {
|
||||
std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
|
||||
std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
|
||||
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
|
||||
|
||||
// The buffer_context is initially created by ggml_backend_buft_alloc_buffer
|
||||
// before any tensor is initialized (at the beginning of alloc_tensor_range).
|
||||
@@ -3289,6 +3400,76 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
return;
|
||||
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_MXFP4) {
|
||||
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
||||
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
|
||||
|
||||
// Allocate the new extra and create aliases from the original.
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
ggml_tensor_extra_cl_mxfp4 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_mxfp4();
|
||||
|
||||
size_t size_e = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(char);
|
||||
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
|
||||
GGML_ASSERT(size_e + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
CL_CHECK(clEnqueueWriteBuffer(
|
||||
queue, data_device, CL_TRUE, 0,
|
||||
ggml_nbytes(tensor), data, 0, NULL, NULL));
|
||||
|
||||
// The original tensor memory is divided into scales and quants, i.e.,
|
||||
// we first store scales, then quants.
|
||||
cl_buffer_region region;
|
||||
|
||||
// Create subbuffer for scales.
|
||||
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
|
||||
region.size = size_e;
|
||||
extra->e = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
||||
CL_CHECK(err);
|
||||
auto previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for quants.
|
||||
region.origin = align_to(previous_origin + size_e, backend_ctx->alignment);
|
||||
region.size = size_q;
|
||||
extra->q = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
// Create image for Q
|
||||
cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
|
||||
cl_image_desc img_desc_q = {
|
||||
CL_MEM_OBJECT_IMAGE1D_BUFFER,
|
||||
static_cast<size_t>(ggml_nelements(tensor)/32*2),
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
{ extra->q }
|
||||
};
|
||||
extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
|
||||
|
||||
tensor->extra = extra;
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_SOA_Q
|
||||
@@ -3337,6 +3518,31 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clEnqueueReadBuffer(
|
||||
queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
} else if (tensor->type == GGML_TYPE_MXFP4) {
|
||||
ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra;
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, &evt));
|
||||
@@ -3658,6 +3864,19 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(queue));
|
||||
} else if (tensor->type == GGML_TYPE_MXFP4) {
|
||||
ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *) tensor->extra;
|
||||
GGML_ASSERT(extra);
|
||||
|
||||
size_t size_q = ggml_nelements(tensor)/QK_MXFP4 * QK_MXFP4/2;
|
||||
size_t size_e = ggml_nelements(tensor)/QK_MXFP4 * sizeof(char);
|
||||
GGML_ASSERT(size_q + size_e == ggml_nbytes(tensor));
|
||||
buf_q = malloc(size_q);
|
||||
buf_d = malloc(size_e);
|
||||
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_e, buf_d, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(queue));
|
||||
} else {
|
||||
// Read out the tensor from GPU memory.
|
||||
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
|
||||
@@ -6048,6 +6267,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
|
||||
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
|
||||
#endif
|
||||
|
||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
||||
@@ -6752,6 +6972,45 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
|
||||
break;
|
||||
case GGML_TYPE_MXFP4: {
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
kernel = backend_ctx->kernel_mul_mv_mxfp4_f32_flat;
|
||||
|
||||
cl_mem q;
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
nth0 = 16;
|
||||
nth1 = 2;
|
||||
ndst = nth1*2;
|
||||
|
||||
q = extra0_mxfp4->q;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
nth1 = 2;
|
||||
ndst = nth1*2;
|
||||
|
||||
q = extra0_mxfp4->q_img;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_mxfp4->e));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
|
||||
#else
|
||||
kernel = backend_ctx->kernel_mul_mv_mxfp4_f32;
|
||||
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
@@ -6785,6 +7044,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float)*nth0,nullptr));
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -6850,8 +7110,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
cl_ulong offset2 = extra2->offset + src2->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
GGML_UNUSED(offset0);
|
||||
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
|
||||
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
|
||||
#endif
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
@@ -6940,6 +7203,51 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_MXFP4: {
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat;
|
||||
|
||||
cl_mem q;
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
sgs = 16;
|
||||
nsg = 2;
|
||||
ndst = 2;
|
||||
|
||||
q = extra0_mxfp4->q;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
sgs = 64;
|
||||
nsg = 1;
|
||||
ndst = 4;
|
||||
|
||||
q = extra0_mxfp4->q_img;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_mxfp4->e));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne20));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne21));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
|
||||
#else // GGML_OPENCL_SOA_Q
|
||||
kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32;
|
||||
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
@@ -6979,7 +7287,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs,nullptr));
|
||||
|
||||
#endif // GGML_OPENCL_SOA_Q
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
||||
@@ -116,3 +116,49 @@ kernel void kernel_convert_block_q4_0_noshuffle(
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_mxfp4
|
||||
//------------------------------------------------------------------------------
|
||||
// QK_MXFP4 sizes the quant array below: each block stores QK_MXFP4 / 2 bytes.
#define QK_MXFP4 32
|
||||
// One MXFP4 block in its interleaved (array-of-structs) form: a single scale
// byte followed by the block's quant bytes.
struct block_mxfp4 {
|
||||
uchar e; // E8M0
|
||||
// Quant bytes of the block.
// NOTE(review): presumably two 4-bit values packed per byte - confirm.
uchar qs[QK_MXFP4 / 2];
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_mxfp4
|
||||
// Convert the block_mxfp4 format to 2 separate arrays (AOS -> SOA).
|
||||
// This kernel does not deshuffle the bits.
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_convert_block_mxfp4(
    global struct block_mxfp4 * src0,
    global uchar * dst_q,
    global uchar * dst_e
) {
    // One work-item handles exactly one block, selected by its global id.
    const size_t block_idx = get_global_id(0);

    global const struct block_mxfp4 * blk = src0 + block_idx;
    global uchar * quants_out = dst_q + block_idx * (QK_MXFP4 / 2);

    // Scale byte goes to the dense scale array ...
    dst_e[block_idx] = blk->e;

    // ... and the quant bytes are copied verbatim (no bit reshuffling).
    for (int k = 0; k < QK_MXFP4 / 2; ++k) {
        quants_out[k] = blk->qs[k];
    }
}
|
||||
|
||||
// Inverse of kernel_convert_block_mxfp4: rebuilds block_mxfp4 structs (SOA -> AOS)
// from a dense quant array and a dense scale array. One work-item per block.
//
//   src_q : QK_MXFP4 / 2 quant bytes per block, blocks stored back to back
//   src_e : one scale byte per block
//   dst   : destination array of block_mxfp4
//
// src_e is declared as uchar to match how it is written by the convert kernel
// and how it is read here (one byte per block); it was previously declared as
// half and cast back to uchar before use.
kernel void kernel_restore_block_mxfp4(
    global uchar * src_q,
    global uchar * src_e,
    global struct block_mxfp4 * dst
) {
    global struct block_mxfp4 * b = dst + get_global_id(0);
    global uchar * q = src_q + QK_MXFP4 / 2 * get_global_id(0);
    global uchar * e = src_e + get_global_id(0);

    b->e = *e;
    for (int i = 0; i < QK_MXFP4 / 2; ++i) {
        b->qs[i] = q[i];
    }
}
|
||||
|
||||
176
ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl
Normal file
176
ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl
Normal file
@@ -0,0 +1,176 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK_MXFP4 32
|
||||
|
||||
// Expands one 4-bit code (taken from the low nibble of `nibble`) into the
// bit pattern of a half:
//   - bits 0..2 of the code are placed in half bits 9..11,
//   - bit 3 of the code becomes the half sign bit (bit 15),
//   - 0x3800 is added whenever bits 0..2 are not all zero,
//   - the code whose low bits are 001 (pattern 0x0200 after shifting) keeps
//     only that 0x3800 term, i.e. it maps to 0x3800 (0.5 as an IEEE half).
static inline ushort mxfp4_nibble_to_fp16_bits(ushort nibble) {
    const ushort sign = (nibble << 12) & 0x8000;
    ushort magnitude  = (nibble << 9) & 0x0E00;
    // The bias is decided before the special case below clears the magnitude.
    const ushort bias = (magnitude == 0) ? 0x0 : 0x3800;
    if (magnitude == 0x0200) {
        magnitude = 0x0;
    }
    return sign + bias + magnitude;
}

// Converts the four 4-bit codes packed in `fp4x4` into a half4.
// Component k of the result comes from bits [4k+3 : 4k] of the input.
static inline half4 mxfp4_to_fp16_packed(ushort fp4x4) {
    const ushort h0 = mxfp4_nibble_to_fp16_bits(fp4x4);
    const ushort h1 = mxfp4_nibble_to_fp16_bits((ushort)(fp4x4 >> 4));
    const ushort h2 = mxfp4_nibble_to_fp16_bits((ushort)(fp4x4 >> 8));
    const ushort h3 = mxfp4_nibble_to_fp16_bits((ushort)(fp4x4 >> 12));

    return as_half4((ushort4)(h0, h1, h2, h3));
}
|
||||
|
||||
// Reinterprets an 8-bit exponent-only scale as a float.
// A zero input yields the bit pattern 0x00400000; any other value is placed
// directly into bits 23..30 of the float (its exponent field).
static inline float e8m0_to_fp32(uchar x) {
    if (x == 0) {
        return as_float(0x00400000);
    }
    return as_float((uint) x << 23);
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_R0_MXFP4 2 // number of rows each subgroup works on
|
||||
#define N_SG_MXFP4 2 // number of subgroups in a work group
|
||||
#define N_SIMDWIDTH 16 // subgroup size
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_R0_MXFP4 4
|
||||
#define N_SG_MXFP4 1
|
||||
#define N_SIMDWIDTH 64
|
||||
#define SRC0Q_IMG
|
||||
#endif
|
||||
|
||||
// Matrix-vector product for mul_mat_id with MXFP4 weights in the flat (SOA)
// layout: quants in src0_q (16 bytes per block, read either through a pointer
// or an image buffer when SRC0Q_IMG is defined) and scales in src0_e (one
// byte per block).
//
//   src2      : table of uint indices; entry [iid1][idx] selects the src0
//               slice (i02) used by this work-group
//   src1/dst  : f32 input / output, addressed via offset1 / offsetd
//   nb01,nb02 : src0 strides in bytes, divided by 17 (sizeof(block_mxfp4)) to
//               get block indices into the flat arrays
//
// Work distribution: group ids 0 and 1 select the row tile (r0) and the src1
// row (r1); group id 2 encodes (iid1, idx). Each subgroup accumulates
// N_R0_MXFP4 consecutive rows. Within a subgroup, two adjacent lanes share a
// block: `ix` (lane >> 1) picks the block and `it` (lane & 1) picks which
// 8-byte half of its quants the lane handles.
//
// The block loop advances by N_SIMDWIDTH / 2, so the kernel is only correct
// when the subgroup size equals N_SIMDWIDTH. The required-subgroup-size
// attribute below pins it, matching kernel_mul_mv_mxfp4_f32_flat.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_mxfp4_f32_flat(
#ifdef SRC0Q_IMG
    __read_only image1d_buffer_t src0_q,
#else
    global uchar * src0_q,
#endif
    global uchar * src0_e,
    global uchar * src1,
    ulong          offset1,
    global uchar * src2,
    ulong          offset2,
    global uchar * dst,
    ulong          offsetd,
    int            ne00,
    ulong          nb01,
    ulong          nb02,
    ulong          nb03,
    int            ne11,
    int            ne12,
    ulong          nb11,
    ulong          nb12,
    ulong          nb13,
    int            ne20,
    int            ne21,
    ulong          nb21,
    int            ne0,
    int            ne1,
    int            r2,
    int            r3
) {
    dst = dst + offsetd;

    const int iid1 = get_group_id(2) / ne20;
    const int idx  = get_group_id(2) % ne20;

    // Index of the src0 slice selected for this (iid1, idx) pair.
    uint i02 = ((global uint *) (src2 + offset2 + iid1 * nb21))[idx];

    int i11 = idx % ne11;

    int nb = ne00 / QK_MXFP4;

    // Convert the byte offset to a block index in 64-bit arithmetic, so the
    // product is not truncated to 32 bits before the division.
    // 17 = sizeof(block_mxfp4)
    ulong src0_off = (i02 * nb02) / 17;

    src0_e = src0_e + src0_off;

    dst = dst + (idx * ne0 + iid1 * ne1 * ne0) * sizeof(float);

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);

    int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;

    // 17 = sizeof(block_mxfp4)
    ulong offset_src0 = (first_row * nb01) / 17;
#ifdef SRC0Q_IMG
    ulong offset_q = src0_off + offset_src0;
#else
    src0_q = src0_q + src0_off*16;
    global uchar16 * x_q = (global uchar16 *)(src0_q) + offset_src0;
#endif
    global uchar * x_e = src0_e + offset_src0;

    const short ix = get_sub_group_local_id() >> 1;
    const short it = get_sub_group_local_id() & 1;

    float sumf[N_R0_MXFP4] = {0.f};

    src1 = src1 + offset1 + i11 * nb11 + iid1 * nb12;
    global float * y  = (global float *) (src1 + r1 * nb11);
    global float * yb = y + ix * QK_MXFP4 + it * 8;

    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH / 2) {
        global float4 * y4 = (global float4 *)yb;

        #pragma unroll
        for (short row = 0; row < N_R0_MXFP4; row++) {
            uchar xb_e = x_e[row * nb + ib];
#ifdef SRC0Q_IMG
            ushort4 xb_q = as_ushort4(read_imageui(src0_q, (offset_q + row * nb + ib) * 2 + it).xy);
#else
            ushort4 xb_q = vload4(0, (global ushort *)((global uchar *)(x_q + row * nb + ib) + 8 * it));
#endif

            half4 fp16x4_0 = mxfp4_to_fp16_packed(xb_q.s0);
            half4 fp16x4_1 = mxfp4_to_fp16_packed(xb_q.s1);
            float4 acc1 = y4[0] * (float4)(fp16x4_0.s0, fp16x4_0.s2, fp16x4_1.s0, fp16x4_1.s2);
            acc1 += y4[4] * (float4)(fp16x4_0.s1, fp16x4_0.s3, fp16x4_1.s1, fp16x4_1.s3);

            fp16x4_0 = mxfp4_to_fp16_packed(xb_q.s2);
            fp16x4_1 = mxfp4_to_fp16_packed(xb_q.s3);
            acc1 += y4[1] * (float4)(fp16x4_0.s0, fp16x4_0.s2, fp16x4_1.s0, fp16x4_1.s2);
            acc1 += y4[5] * (float4)(fp16x4_0.s1, fp16x4_0.s3, fp16x4_1.s1, fp16x4_1.s3);

            sumf[row] += e8m0_to_fp32(xb_e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
        }

        yb += (N_SIMDWIDTH / 2) * QK_MXFP4;
    }

    global float * dst_f32 = (global float *)dst + (ulong)r1 * ne0;

    // Reduce each row across the subgroup; lane 0 writes the result.
    for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
        float sum_all = sub_group_reduce_add(sumf[row]);
        if (get_sub_group_local_id() == 0) {
            dst_f32[first_row + row] = sum_all;
        }
    }
}
|
||||
167
ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl
Normal file
167
ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl
Normal file
@@ -0,0 +1,167 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK_MXFP4 32
|
||||
|
||||
// Expands one 4-bit code (taken from the low nibble of `nibble`) into the
// bit pattern of a half:
//   - bits 0..2 of the code are placed in half bits 9..11,
//   - bit 3 of the code becomes the half sign bit (bit 15),
//   - 0x3800 is added whenever bits 0..2 are not all zero,
//   - the code whose low bits are 001 (pattern 0x0200 after shifting) keeps
//     only that 0x3800 term, i.e. it maps to 0x3800 (0.5 as an IEEE half).
static inline ushort mxfp4_nibble_to_fp16_bits(ushort nibble) {
    const ushort sign = (nibble << 12) & 0x8000;
    ushort magnitude  = (nibble << 9) & 0x0E00;
    // The bias is decided before the special case below clears the magnitude.
    const ushort bias = (magnitude == 0) ? 0x0 : 0x3800;
    if (magnitude == 0x0200) {
        magnitude = 0x0;
    }
    return sign + bias + magnitude;
}

// Converts the four 4-bit codes packed in `fp4x4` into a half4.
// Component k of the result comes from bits [4k+3 : 4k] of the input.
static inline half4 mxfp4_to_fp16_packed(ushort fp4x4) {
    const ushort h0 = mxfp4_nibble_to_fp16_bits(fp4x4);
    const ushort h1 = mxfp4_nibble_to_fp16_bits((ushort)(fp4x4 >> 4));
    const ushort h2 = mxfp4_nibble_to_fp16_bits((ushort)(fp4x4 >> 8));
    const ushort h3 = mxfp4_nibble_to_fp16_bits((ushort)(fp4x4 >> 12));

    return as_half4((ushort4)(h0, h1, h2, h3));
}
|
||||
|
||||
// Reinterprets an 8-bit exponent-only scale as a float.
// A zero input yields the bit pattern 0x00400000; any other value is placed
// directly into bits 23..30 of the float (its exponent field).
static inline float e8m0_to_fp32(uchar x) {
    if (x == 0) {
        return as_float(0x00400000);
    }
    return as_float((uint) x << 23);
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_R0_MXFP4 2 // number of rows each subgroup works on
|
||||
#define N_SG_MXFP4 2 // number of subgroups in a work group
|
||||
#define N_SIMDWIDTH 16 // subgroup size
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_R0_MXFP4 2
|
||||
#define N_SG_MXFP4 2
|
||||
#define N_SIMDWIDTH 64
|
||||
#define SRC0Q_IMG
|
||||
#endif
|
||||
|
||||
// Matrix-vector product with MXFP4 weights in the flat (SOA) layout: quants
// in src0_q (16 bytes per block, read either through a pointer or an image
// buffer when SRC0Q_IMG is defined) and scales in src0_e (one byte per block).
//
//   src1/dst        : f32 input / output, addressed via offset1 / offsetd
//   nb01,nb02,nb03  : src0 strides in bytes, divided by 17
//                     (sizeof(block_mxfp4)) to get a block index
//   r2, r3          : divisors applied to the src1 batch indices (i12, i13)
//                     when selecting the src0 slice
//
// Work distribution: group id 0 selects the row tile (r0), group id 1 the
// src1 row (r1), group id 2 the combined batch index (im). Each subgroup
// accumulates N_R0_MXFP4 consecutive rows. Two adjacent lanes share a block:
// `ix` picks the block, `it` picks which 8-byte half of its quants the lane
// handles. The block loop advances by N_SIMDWIDTH / 2, hence the required
// subgroup size.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_mxfp4_f32_flat(
#ifdef SRC0Q_IMG
    __read_only image1d_buffer_t src0_q,
#else
    global uchar * src0_q,
#endif
    global uchar * src0_e,
    global uchar * src1,
    ulong          offset1,
    global uchar * dst,
    ulong          offsetd,
    int            ne00,
    ulong          nb01,
    ulong          nb02,
    ulong          nb03,
    int            ne12,
    ulong          nb11,
    ulong          nb12,
    ulong          nb13,
    int            ne0,
    int            ne1,
    int            r2,
    int            r3
) {
    src1 = src1 + offset1;
    dst  = dst  + offsetd;

    int nb = ne00 / QK_MXFP4;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;

    uint i12 = im % ne12;
    uint i13 = im / ne12;

    // Byte offset -> block index, computed in 64-bit arithmetic so the byte
    // offset is not truncated to 32 bits before the division.
    // 17 = sizeof(block_mxfp4)
    ulong offset_src0 = (first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03) / 17;
#ifdef SRC0Q_IMG
    ulong offset_q = offset_src0;
#else
    global uchar16 * x_q = (global uchar16 *)(src0_q) + offset_src0;
#endif
    global uchar * x_e = src0_e + offset_src0;

    ulong offset_src1 = r1 * nb11 + i12 * nb12 + i13 * nb13;
    global float * y = (global float *)(src1 + offset_src1);

    const short ix = get_sub_group_local_id() >> 1; // block handled by this lane pair
    const short it = get_sub_group_local_id() & 1;  // 0 or 1

    float sumf[N_R0_MXFP4] = {0.f};

    global float * yb = y + ix * QK_MXFP4 + it * 8;

    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
        global float4 * y4 = (global float4 *)yb;

        #pragma unroll
        for (short row = 0; row < N_R0_MXFP4; row++) {
            uchar xb_e = x_e[row * nb + ib];
#ifdef SRC0Q_IMG
            ushort4 xb_q = as_ushort4(read_imageui(src0_q, (offset_q + row * nb + ib) * 2 + it).xy);
#else
            ushort4 xb_q = vload4(0, (global ushort *)((global uchar *)(x_q + row * nb + ib) + 8 * it));
#endif

            half4 fp16x4_0 = mxfp4_to_fp16_packed(xb_q.s0);
            half4 fp16x4_1 = mxfp4_to_fp16_packed(xb_q.s1);
            float4 acc1 = y4[0] * (float4)(fp16x4_0.s0, fp16x4_0.s2, fp16x4_1.s0, fp16x4_1.s2);
            acc1 += y4[4] * (float4)(fp16x4_0.s1, fp16x4_0.s3, fp16x4_1.s1, fp16x4_1.s3);

            fp16x4_0 = mxfp4_to_fp16_packed(xb_q.s2);
            fp16x4_1 = mxfp4_to_fp16_packed(xb_q.s3);
            acc1 += y4[1] * (float4)(fp16x4_0.s0, fp16x4_0.s2, fp16x4_1.s0, fp16x4_1.s2);
            acc1 += y4[5] * (float4)(fp16x4_0.s1, fp16x4_0.s3, fp16x4_1.s1, fp16x4_1.s3);

            sumf[row] += e8m0_to_fp32(xb_e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
        }

        yb += (N_SIMDWIDTH/2) * QK_MXFP4;
    }

    global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;

    // Reduce each row across the subgroup; lane 0 writes the result.
    for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
        float sum_all = sub_group_reduce_add(sumf[row]);
        if (get_sub_group_local_id() == 0) {
            dst_f32[first_row + row] = sum_all;
        }
    }
}
|
||||
@@ -795,7 +795,7 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
||||
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||
|
||||
@@ -4073,7 +4073,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
||||
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
||||
/* .event_record = */ ggml_backend_sycl_event_record,
|
||||
/* .event_wait = */ ggml_backend_sycl_event_wait,
|
||||
/* .optimize_graph = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_sycl_guid() {
|
||||
|
||||
@@ -593,7 +593,7 @@ struct vk_device_struct {
|
||||
bool disable_fusion;
|
||||
bool disable_host_visible_vidmem;
|
||||
bool allow_sysmem_fallback;
|
||||
bool disable_optimize_graph;
|
||||
bool disable_graph_optimize;
|
||||
|
||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||
std::unique_ptr<vk_memory_logger> memory_logger;
|
||||
@@ -3624,8 +3624,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
const char* GGML_VK_ALLOW_SYSMEM_FALLBACK = getenv("GGML_VK_ALLOW_SYSMEM_FALLBACK");
|
||||
device->allow_sysmem_fallback = GGML_VK_ALLOW_SYSMEM_FALLBACK != nullptr;
|
||||
|
||||
const char* GGML_VK_DISABLE_OPTIMIZE_GRAPH = getenv("GGML_VK_DISABLE_OPTIMIZE_GRAPH");
|
||||
device->disable_optimize_graph = GGML_VK_DISABLE_OPTIMIZE_GRAPH != nullptr;
|
||||
const char* GGML_VK_DISABLE_GRAPH_OPTIMIZE = getenv("GGML_VK_DISABLE_GRAPH_OPTIMIZE");
|
||||
device->disable_graph_optimize = GGML_VK_DISABLE_GRAPH_OPTIMIZE != nullptr;
|
||||
|
||||
bool fp16_storage = false;
|
||||
bool fp16_compute = false;
|
||||
@@ -11914,12 +11914,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
}
|
||||
|
||||
// Sort the graph for improved parallelism.
|
||||
static void ggml_vk_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * graph)
|
||||
static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * graph)
|
||||
{
|
||||
VK_LOG_DEBUG("ggml_vk_optimize_graph(" << graph->n_nodes << " nodes)");
|
||||
VK_LOG_DEBUG("ggml_vk_graph_optimize(" << graph->n_nodes << " nodes)");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
|
||||
if (ctx->device->disable_optimize_graph) {
|
||||
if (ctx->device->disable_graph_optimize) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -12053,7 +12053,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ ggml_vk_optimize_graph,
|
||||
/* .graph_optimize = */ ggml_vk_graph_optimize,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_vk_guid() {
|
||||
|
||||
@@ -823,7 +823,7 @@ static ggml_backend_i ggml_backend_webgpu_i = {
|
||||
/* .graph_compute = */ ggml_backend_webgpu_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
/* End GGML Backend Interface */
|
||||
|
||||
@@ -574,7 +574,7 @@ static ggml_backend_i ggml_backend_zdnn_i = {
|
||||
/* .graph_compute = */ ggml_backend_zdnn_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_zdnn_guid(void) {
|
||||
|
||||
@@ -6507,6 +6507,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_pad());
|
||||
test_cases.emplace_back(new test_pad_ext());
|
||||
test_cases.emplace_back(new test_pad_reflect_1d());
|
||||
test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));
|
||||
test_cases.emplace_back(new test_roll());
|
||||
test_cases.emplace_back(new test_arange());
|
||||
test_cases.emplace_back(new test_timestep_embedding());
|
||||
@@ -6628,9 +6629,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8192, 512, 2, 1}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8192, 512, 2, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
|
||||
@@ -6645,6 +6648,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
|
||||
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {512, 34, 2, 1}));
|
||||
test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 80, 1, 1}));
|
||||
test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 80, 4, 1}));
|
||||
test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 1, 1}));
|
||||
test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, true));
|
||||
|
||||
|
||||
@@ -1402,6 +1402,12 @@ static void test_template_output_parsers() {
|
||||
"Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_GRANITE}));
|
||||
assert_msg_equals(
|
||||
message_assist,
|
||||
common_chat_parse(
|
||||
"Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ true,
|
||||
{COMMON_CHAT_FORMAT_GRANITE}));
|
||||
|
||||
// Test parsing content with thinking
|
||||
assert_msg_equals(message_assist_thoughts,
|
||||
@@ -1412,6 +1418,59 @@ static void test_template_output_parsers() {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_GRANITE,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
}));
|
||||
assert_msg_equals(message_assist_thoughts_unparsed_deepseek,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_GRANITE}));
|
||||
assert_msg_equals(message_assist_thoughts,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think><response>Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ true,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_GRANITE,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
}));
|
||||
assert_msg_equals(message_assist_thoughts,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think><response>Hello, world!\nWhat's up?</response>",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_GRANITE,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
}));
|
||||
assert_msg_equals(simple_assist_msg("<think>I'm\nthinking</think><response>Hello, world!\nWhat's up?</response>"),
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think><response>Hello, world!\nWhat's up?</response>",
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_GRANITE}));
|
||||
assert_msg_equals(message_assist_empty,
|
||||
common_chat_parse(
|
||||
"<think",
|
||||
/* is_partial= */ true,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_GRANITE,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
}));
|
||||
assert_msg_equals(message_assist_empty,
|
||||
common_chat_parse(
|
||||
"<think",
|
||||
/* is_partial= */ true,
|
||||
{COMMON_CHAT_FORMAT_GRANITE}));
|
||||
assert_msg_equals(message_assist_thoughts_no_content,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking",
|
||||
/* is_partial= */ true,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_GRANITE,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
}));
|
||||
assert_msg_equals(
|
||||
message_assist_empty,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think><response",
|
||||
/* is_partial= */ true,
|
||||
{COMMON_CHAT_FORMAT_GRANITE}));
|
||||
|
||||
// Test parsing tool calls
|
||||
assert_msg_equals(message_assist_call,
|
||||
@@ -1419,6 +1478,38 @@ static void test_template_output_parsers() {
|
||||
"<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]",
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_GRANITE}));
|
||||
assert_msg_equals(
|
||||
message_assist_call_empty_args,
|
||||
common_chat_parse(
|
||||
"<|tool_call|>[{\"name\": \"special_function\"",
|
||||
/* is_partial= */ true,
|
||||
{COMMON_CHAT_FORMAT_GRANITE}));
|
||||
assert_msg_equals(
|
||||
message_assist_call_cutoff_args,
|
||||
common_chat_parse(
|
||||
"<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg",
|
||||
/* is_partial= */ true,
|
||||
{COMMON_CHAT_FORMAT_GRANITE}));
|
||||
assert_msg_equals(
|
||||
message_assist_call_cutoff_args,
|
||||
common_chat_parse(
|
||||
"<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg",
|
||||
/* is_partial= */ true,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_GRANITE,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
}));
|
||||
|
||||
// Test parsing tool calls with thinking
|
||||
assert_msg_equals(
|
||||
message_assist_call_thoughts,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think><|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, {",
|
||||
/* is_partial= */ true,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_GRANITE,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
}));
|
||||
|
||||
// Test template generation for regular content
|
||||
test_templates(tmpls.get(), end_tokens, message_assist, tools,
|
||||
|
||||
@@ -30,8 +30,10 @@ options:
|
||||
--delay <0...N> (seconds) delay between each test (default: 0)
|
||||
-o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: md)
|
||||
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
|
||||
--list-devices list available devices and exit
|
||||
-v, --verbose verbose output
|
||||
--progress print test progress indicators
|
||||
-rpc, --rpc <rpc_servers> register RPC devices (comma separated)
|
||||
|
||||
test parameters:
|
||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||
@@ -48,11 +50,12 @@ test parameters:
|
||||
--cpu-strict <0|1> (default: 0)
|
||||
--poll <0...100> (default: 50)
|
||||
-ngl, --n-gpu-layers <n> (default: 99)
|
||||
-rpc, --rpc <rpc_servers> (default: none)
|
||||
-ncmoe, --n-cpu-moe <n> (default: 0)
|
||||
-sm, --split-mode <none|layer|row> (default: layer)
|
||||
-mg, --main-gpu <i> (default: 0)
|
||||
-nkvo, --no-kv-offload <0|1> (default: 0)
|
||||
-fa, --flash-attn <0|1> (default: 0)
|
||||
-dev, --device <dev0/dev1/...> (default: auto)
|
||||
-mmp, --mmap <0|1> (default: 1)
|
||||
-embd, --embeddings <0|1> (default: 0)
|
||||
-ts, --tensor-split <ts0/ts1/..> (default: 0)
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
@@ -135,6 +136,101 @@ static std::string get_gpu_info() {
|
||||
return join(gpu_list, ", ");
|
||||
}
|
||||
|
||||
static std::vector<ggml_backend_dev_t> parse_devices_arg(const std::string & value) {
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
std::string trimmed = string_strip(value);
|
||||
if (trimmed.empty()) {
|
||||
throw std::invalid_argument("no devices specified");
|
||||
}
|
||||
if (trimmed == "auto") {
|
||||
return devices;
|
||||
}
|
||||
|
||||
auto dev_names = string_split<std::string>(trimmed, '/');
|
||||
if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") {
|
||||
devices.push_back(nullptr);
|
||||
return devices;
|
||||
}
|
||||
|
||||
for (auto & name : dev_names) {
|
||||
std::string dev_name = string_strip(name);
|
||||
if (dev_name.empty()) {
|
||||
throw std::invalid_argument("invalid device specification");
|
||||
}
|
||||
auto * dev = ggml_backend_dev_by_name(dev_name.c_str());
|
||||
if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str()));
|
||||
}
|
||||
devices.push_back(dev);
|
||||
}
|
||||
|
||||
devices.push_back(nullptr);
|
||||
return devices;
|
||||
}
|
||||
|
||||
static std::vector<ggml_backend_dev_t> register_rpc_device_list(const std::string & servers) {
|
||||
auto rpc_servers = string_split<std::string>(servers, ',');
|
||||
if (rpc_servers.empty()) {
|
||||
throw std::invalid_argument("no RPC servers specified");
|
||||
}
|
||||
|
||||
auto * rpc_reg = ggml_backend_reg_by_name("RPC");
|
||||
if (!rpc_reg) {
|
||||
throw std::invalid_argument("failed to find RPC backend");
|
||||
}
|
||||
|
||||
using add_rpc_device_fn = ggml_backend_dev_t (*)(const char * endpoint);
|
||||
auto * ggml_backend_rpc_add_device_fn = (add_rpc_device_fn) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
|
||||
if (!ggml_backend_rpc_add_device_fn) {
|
||||
throw std::invalid_argument("failed to find RPC device add function");
|
||||
}
|
||||
|
||||
static std::unordered_set<std::string> registered;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
for (const auto & server : rpc_servers) {
|
||||
ggml_backend_dev_t dev = nullptr;
|
||||
|
||||
std::string name = string_format("RPC[%s]", server.c_str());
|
||||
|
||||
if (registered.find(server) != registered.end()) {
|
||||
dev = ggml_backend_dev_by_name(name.c_str());
|
||||
}
|
||||
|
||||
if (!dev) {
|
||||
dev = ggml_backend_rpc_add_device_fn(server.c_str());
|
||||
if (!dev) {
|
||||
throw std::invalid_argument(string_format("failed to add RPC device for server '%s'", server.c_str()));
|
||||
}
|
||||
ggml_backend_device_register(dev);
|
||||
registered.insert(server);
|
||||
}
|
||||
|
||||
devices.push_back(dev);
|
||||
}
|
||||
|
||||
return devices;
|
||||
}
|
||||
|
||||
static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
|
||||
if (devices.empty()) {
|
||||
return "auto";
|
||||
}
|
||||
|
||||
if (devices.size() == 1 && devices[0] == nullptr) {
|
||||
return "none";
|
||||
}
|
||||
|
||||
std::vector<std::string> names;
|
||||
for (auto * dev : devices) {
|
||||
if (dev == nullptr) {
|
||||
break;
|
||||
}
|
||||
names.push_back(ggml_backend_dev_name(dev));
|
||||
}
|
||||
|
||||
return join(names, "/");
|
||||
}
|
||||
|
||||
// command line params
|
||||
enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
|
||||
|
||||
@@ -251,11 +347,11 @@ struct cmd_params {
|
||||
std::vector<int> poll;
|
||||
std::vector<int> n_gpu_layers;
|
||||
std::vector<int> n_cpu_moe;
|
||||
std::vector<std::string> rpc_servers;
|
||||
std::vector<llama_split_mode> split_mode;
|
||||
std::vector<int> main_gpu;
|
||||
std::vector<bool> no_kv_offload;
|
||||
std::vector<bool> flash_attn;
|
||||
std::vector<std::vector<ggml_backend_dev_t>> devices;
|
||||
std::vector<std::vector<float>> tensor_split;
|
||||
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
|
||||
std::vector<bool> use_mmap;
|
||||
@@ -288,11 +384,11 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* poll */ { 50 },
|
||||
/* n_gpu_layers */ { 99 },
|
||||
/* n_cpu_moe */ { 0 },
|
||||
/* rpc_servers */ { "" },
|
||||
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
|
||||
/* main_gpu */ { 0 },
|
||||
/* no_kv_offload */ { false },
|
||||
/* flash_attn */ { false },
|
||||
/* devices */ { {} },
|
||||
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
|
||||
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
|
||||
/* use_mmap */ { true },
|
||||
@@ -325,9 +421,13 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
output_format_str(cmd_params_defaults.output_format));
|
||||
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
|
||||
output_format_str(cmd_params_defaults.output_format_stderr));
|
||||
printf(" --list-devices list available devices and exit\n");
|
||||
printf(" -v, --verbose verbose output\n");
|
||||
printf(" --progress print test progress indicators\n");
|
||||
printf(" --no-warmup skip warmup runs before benchmarking\n");
|
||||
if (llama_supports_rpc()) {
|
||||
printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n");
|
||||
}
|
||||
printf("\n");
|
||||
printf("test parameters:\n");
|
||||
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
||||
@@ -357,10 +457,6 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
|
||||
join(cmd_params_defaults.n_cpu_moe, ",").c_str());
|
||||
if (llama_supports_rpc()) {
|
||||
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
|
||||
join(cmd_params_defaults.rpc_servers, ",").c_str());
|
||||
}
|
||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
|
||||
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||
printf(" -mg, --main-gpu <i> (default: %s)\n",
|
||||
@@ -369,6 +465,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||
printf(" -fa, --flash-attn <0|1> (default: %s)\n",
|
||||
join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
|
||||
printf(" -mmp, --mmap <0|1> (default: %s)\n",
|
||||
join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||
printf(" -embd, --embeddings <0|1> (default: %s)\n",
|
||||
@@ -533,6 +630,42 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
|
||||
} else if (arg == "-dev" || arg == "--device") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto combos = string_split<std::string>(argv[i], split_delim);
|
||||
for (const auto & combo : combos) {
|
||||
try {
|
||||
params.devices.push_back(parse_devices_arg(combo));
|
||||
} catch (const std::exception & e) {
|
||||
fprintf(stderr, "error: %s\n", e.what());
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (invalid_param) {
|
||||
break;
|
||||
}
|
||||
} else if (arg == "--list-devices") {
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
devices.push_back(dev);
|
||||
}
|
||||
}
|
||||
printf("Available devices:\n");
|
||||
if (devices.empty()) {
|
||||
printf(" (none)\n");
|
||||
}
|
||||
for (auto * dev : devices) {
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
|
||||
}
|
||||
exit(0);
|
||||
} else if (arg == "-t" || arg == "--threads") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -580,7 +713,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rpc_servers.push_back(argv[i]);
|
||||
try {
|
||||
register_rpc_device_list(argv[i]);
|
||||
} catch (const std::exception & e) {
|
||||
fprintf(stderr, "error: %s\n", e.what());
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} else if (arg == "-sm" || arg == "--split-mode") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -855,9 +994,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.n_cpu_moe.empty()) {
|
||||
params.n_cpu_moe = cmd_params_defaults.n_cpu_moe;
|
||||
}
|
||||
if (params.rpc_servers.empty()) {
|
||||
params.rpc_servers = cmd_params_defaults.rpc_servers;
|
||||
}
|
||||
if (params.split_mode.empty()) {
|
||||
params.split_mode = cmd_params_defaults.split_mode;
|
||||
}
|
||||
@@ -870,6 +1006,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.flash_attn.empty()) {
|
||||
params.flash_attn = cmd_params_defaults.flash_attn;
|
||||
}
|
||||
if (params.devices.empty()) {
|
||||
params.devices = cmd_params_defaults.devices;
|
||||
}
|
||||
if (params.tensor_split.empty()) {
|
||||
params.tensor_split = cmd_params_defaults.tensor_split;
|
||||
}
|
||||
@@ -916,11 +1055,11 @@ struct cmd_params_instance {
|
||||
int poll;
|
||||
int n_gpu_layers;
|
||||
int n_cpu_moe;
|
||||
std::string rpc_servers_str;
|
||||
llama_split_mode split_mode;
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool flash_attn;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
std::vector<float> tensor_split;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
bool use_mmap;
|
||||
@@ -931,57 +1070,8 @@ struct cmd_params_instance {
|
||||
llama_model_params mparams = llama_model_default_params();
|
||||
|
||||
mparams.n_gpu_layers = n_gpu_layers;
|
||||
if (!rpc_servers_str.empty()) {
|
||||
auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
|
||||
|
||||
// add RPC devices
|
||||
if (!rpc_servers.empty()) {
|
||||
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
|
||||
if (!rpc_reg) {
|
||||
fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
|
||||
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
|
||||
if (!ggml_backend_rpc_add_device_fn) {
|
||||
fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
static std::vector<ggml_backend_dev_t> devices;
|
||||
devices.clear();
|
||||
// RPC devices should always come first for performance reasons
|
||||
for (const std::string & server : rpc_servers) {
|
||||
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
|
||||
if (dev) {
|
||||
devices.push_back(dev);
|
||||
} else {
|
||||
fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
// FIXME: use llama.cpp device selection logic
|
||||
// add local GPU devices if any
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
switch (ggml_backend_dev_type(dev)) {
|
||||
case GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
// skip CPU backends since they are handled separately
|
||||
break;
|
||||
|
||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
devices.push_back(dev);
|
||||
break;
|
||||
|
||||
case GGML_BACKEND_DEVICE_TYPE_IGPU:
|
||||
// iGPUs are not used when there are RPC servers
|
||||
break;
|
||||
}
|
||||
}
|
||||
devices.push_back(nullptr);
|
||||
mparams.devices = devices.data();
|
||||
}
|
||||
if (!devices.empty()) {
|
||||
mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
|
||||
}
|
||||
mparams.split_mode = split_mode;
|
||||
mparams.main_gpu = main_gpu;
|
||||
@@ -1029,8 +1119,9 @@ struct cmd_params_instance {
|
||||
|
||||
bool equal_mparams(const cmd_params_instance & other) const {
|
||||
return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
|
||||
rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode &&
|
||||
split_mode == other.split_mode &&
|
||||
main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
|
||||
devices == other.devices &&
|
||||
vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
|
||||
}
|
||||
|
||||
@@ -1060,9 +1151,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & m : params.model)
|
||||
for (const auto & nl : params.n_gpu_layers)
|
||||
for (const auto & ncmoe : params.n_cpu_moe)
|
||||
for (const auto & rpc : params.rpc_servers)
|
||||
for (const auto & sm : params.split_mode)
|
||||
for (const auto & mg : params.main_gpu)
|
||||
for (const auto & devs : params.devices)
|
||||
for (const auto & ts : params.tensor_split)
|
||||
for (const auto & ot : params.tensor_buft_overrides)
|
||||
for (const auto & mmp : params.use_mmap)
|
||||
@@ -1098,11 +1189,11 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .n_cpu_moe = */ ncmoe,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
/* .main_gpu = */ mg,
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .flash_attn = */ fa,
|
||||
/* .devices = */ devs,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
/* .use_mmap = */ mmp,
|
||||
@@ -1131,11 +1222,11 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .n_cpu_moe = */ ncmoe,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
/* .main_gpu = */ mg,
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .flash_attn = */ fa,
|
||||
/* .devices = */ devs,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
/* .use_mmap = */ mmp,
|
||||
@@ -1164,11 +1255,11 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .n_cpu_moe = */ ncmoe,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
/* .main_gpu = */ mg,
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .flash_attn = */ fa,
|
||||
/* .devices = */ devs,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
/* .use_mmap = */ mmp,
|
||||
@@ -1206,6 +1297,7 @@ struct test {
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool flash_attn;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
std::vector<float> tensor_split;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
bool use_mmap;
|
||||
@@ -1241,6 +1333,7 @@ struct test {
|
||||
main_gpu = inst.main_gpu;
|
||||
no_kv_offload = inst.no_kv_offload;
|
||||
flash_attn = inst.flash_attn;
|
||||
devices = inst.devices;
|
||||
tensor_split = inst.tensor_split;
|
||||
tensor_buft_overrides = inst.tensor_buft_overrides;
|
||||
use_mmap = inst.use_mmap;
|
||||
@@ -1287,14 +1380,14 @@ struct test {
|
||||
|
||||
static const std::vector<std::string> & get_fields() {
|
||||
static const std::vector<std::string> fields = {
|
||||
"build_commit", "build_number", "cpu_info", "gpu_info", "backends",
|
||||
"model_filename", "model_type", "model_size", "model_n_params", "n_batch",
|
||||
"n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
|
||||
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
|
||||
"main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
||||
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen",
|
||||
"n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts",
|
||||
"stddev_ts"
|
||||
"build_commit", "build_number", "cpu_info", "gpu_info", "backends",
|
||||
"model_filename", "model_type", "model_size", "model_n_params", "n_batch",
|
||||
"n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
|
||||
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
|
||||
"main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
|
||||
"tensor_buft_overrides", "use_mmap", "embeddings", "no_op_offload",
|
||||
"n_prompt", "n_gen", "n_depth", "test_time", "avg_ns",
|
||||
"stddev_ns", "avg_ts", "stddev_ts"
|
||||
};
|
||||
return fields;
|
||||
}
|
||||
@@ -1378,6 +1471,7 @@ struct test {
|
||||
std::to_string(main_gpu),
|
||||
std::to_string(no_kv_offload),
|
||||
std::to_string(flash_attn),
|
||||
devices_to_string(devices),
|
||||
tensor_split_str,
|
||||
tensor_buft_overrides_str,
|
||||
std::to_string(use_mmap),
|
||||
@@ -1559,6 +1653,9 @@ struct markdown_printer : public printer {
|
||||
if (field == "flash_attn") {
|
||||
return 2;
|
||||
}
|
||||
if (field == "devices") {
|
||||
return -12;
|
||||
}
|
||||
if (field == "use_mmap") {
|
||||
return 4;
|
||||
}
|
||||
@@ -1602,6 +1699,9 @@ struct markdown_printer : public printer {
|
||||
if (field == "no_op_offload") {
|
||||
return "nopo";
|
||||
}
|
||||
if (field == "devices") {
|
||||
return "dev";
|
||||
}
|
||||
if (field == "tensor_split") {
|
||||
return "ts";
|
||||
}
|
||||
@@ -1661,6 +1761,9 @@ struct markdown_printer : public printer {
|
||||
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
|
||||
fields.emplace_back("flash_attn");
|
||||
}
|
||||
if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) {
|
||||
fields.emplace_back("devices");
|
||||
}
|
||||
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
||||
fields.emplace_back("tensor_split");
|
||||
}
|
||||
|
||||
Binary file not shown.
@@ -1,7 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Script to install pre-commit and post-commit hooks for webui
|
||||
# Pre-commit: formats code and builds, stashes unstaged changes
|
||||
# Pre-commit: formats, lints, checks, and builds code, stashes unstaged changes
|
||||
# Post-commit: automatically unstashes changes
|
||||
|
||||
REPO_ROOT=$(git rev-parse --show-toplevel)
|
||||
@@ -44,6 +44,18 @@ if git diff --cached --name-only | grep -q "^tools/server/webui/"; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Run the lint command
|
||||
npm run lint
|
||||
|
||||
# Check if lint command succeeded
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Error: npm run lint failed"
|
||||
if [ $STASH_CREATED -eq 0 ]; then
|
||||
echo "You can restore your unstaged changes with: git stash pop"
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Run the check command
|
||||
npm run check
|
||||
|
||||
@@ -112,7 +124,7 @@ if [ $? -eq 0 ]; then
|
||||
echo " Post-commit: $POST_COMMIT_HOOK"
|
||||
echo ""
|
||||
echo "The hooks will automatically:"
|
||||
echo " • Format and build webui code before commits"
|
||||
echo " • Format, lint, check, and build webui code before commits"
|
||||
echo " • Stash unstaged changes during the process"
|
||||
echo " • Restore your unstaged changes after the commit"
|
||||
echo ""
|
||||
|
||||
@@ -121,3 +121,15 @@
|
||||
@apply bg-background text-foreground;
|
||||
}
|
||||
}
|
||||
|
||||
@layer utilities {
|
||||
.scrollbar-hide {
|
||||
/* Hide scrollbar for Chrome, Safari and Opera */
|
||||
&::-webkit-scrollbar {
|
||||
display: none;
|
||||
}
|
||||
/* Hide scrollbar for IE, Edge and Firefox */
|
||||
-ms-overflow-style: none;
|
||||
scrollbar-width: none;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,20 @@
|
||||
<script lang="ts">
|
||||
import { Settings, Funnel, AlertTriangle, Brain, Cog, Monitor, Sun, Moon } from '@lucide/svelte';
|
||||
import { ChatSettingsFooter, ChatSettingsSection } from '$lib/components/app';
|
||||
import { Checkbox } from '$lib/components/ui/checkbox';
|
||||
import {
|
||||
Settings,
|
||||
Funnel,
|
||||
AlertTriangle,
|
||||
Brain,
|
||||
Cog,
|
||||
Monitor,
|
||||
Sun,
|
||||
Moon,
|
||||
ChevronLeft,
|
||||
ChevronRight
|
||||
} from '@lucide/svelte';
|
||||
import { ChatSettingsFooter, ChatSettingsFields } from '$lib/components/app';
|
||||
import * as Dialog from '$lib/components/ui/dialog';
|
||||
import { Input } from '$lib/components/ui/input';
|
||||
import Label from '$lib/components/ui/label/label.svelte';
|
||||
import { ScrollArea } from '$lib/components/ui/scroll-area';
|
||||
import * as Select from '$lib/components/ui/select';
|
||||
import { Textarea } from '$lib/components/ui/textarea';
|
||||
import { SETTING_CONFIG_DEFAULT, SETTING_CONFIG_INFO } from '$lib/constants/settings-config';
|
||||
import { supportsVision } from '$lib/stores/server.svelte';
|
||||
import { SETTING_CONFIG_DEFAULT } from '$lib/constants/settings-config';
|
||||
import { config, updateMultipleConfig, resetConfig } from '$lib/stores/settings.svelte';
|
||||
import { setMode } from 'mode-watcher';
|
||||
import type { Component } from 'svelte';
|
||||
@@ -224,12 +229,20 @@
|
||||
let localConfig: SettingsConfigType = $state({ ...config() });
|
||||
let originalTheme: string = $state('');
|
||||
|
||||
let canScrollLeft = $state(false);
|
||||
let canScrollRight = $state(false);
|
||||
let scrollContainer: HTMLDivElement | undefined = $state();
|
||||
|
||||
function handleThemeChange(newTheme: string) {
|
||||
localConfig.theme = newTheme;
|
||||
|
||||
setMode(newTheme as 'light' | 'dark' | 'system');
|
||||
}
|
||||
|
||||
function handleConfigChange(key: string, value: string | boolean) {
|
||||
localConfig[key] = value;
|
||||
}
|
||||
|
||||
function handleClose() {
|
||||
if (localConfig.theme !== originalTheme) {
|
||||
setMode(originalTheme as 'light' | 'dark' | 'system');
|
||||
@@ -298,18 +311,63 @@
|
||||
onOpenChange?.(false);
|
||||
}
|
||||
|
||||
function scrollToCenter(element: HTMLElement) {
|
||||
if (!scrollContainer) return;
|
||||
|
||||
const containerRect = scrollContainer.getBoundingClientRect();
|
||||
const elementRect = element.getBoundingClientRect();
|
||||
|
||||
const elementCenter = elementRect.left + elementRect.width / 2;
|
||||
const containerCenter = containerRect.left + containerRect.width / 2;
|
||||
const scrollOffset = elementCenter - containerCenter;
|
||||
|
||||
scrollContainer.scrollBy({ left: scrollOffset, behavior: 'smooth' });
|
||||
}
|
||||
|
||||
function scrollLeft() {
|
||||
if (!scrollContainer) return;
|
||||
|
||||
scrollContainer.scrollBy({ left: -250, behavior: 'smooth' });
|
||||
}
|
||||
|
||||
function scrollRight() {
|
||||
if (!scrollContainer) return;
|
||||
|
||||
scrollContainer.scrollBy({ left: 250, behavior: 'smooth' });
|
||||
}
|
||||
|
||||
function updateScrollButtons() {
|
||||
if (!scrollContainer) return;
|
||||
|
||||
const { scrollLeft, scrollWidth, clientWidth } = scrollContainer;
|
||||
canScrollLeft = scrollLeft > 0;
|
||||
canScrollRight = scrollLeft < scrollWidth - clientWidth - 1; // -1 for rounding
|
||||
}
|
||||
|
||||
$effect(() => {
|
||||
if (open) {
|
||||
localConfig = { ...config() };
|
||||
originalTheme = config().theme as string;
|
||||
|
||||
setTimeout(updateScrollButtons, 100);
|
||||
}
|
||||
});
|
||||
|
||||
$effect(() => {
|
||||
if (scrollContainer) {
|
||||
updateScrollButtons();
|
||||
}
|
||||
});
|
||||
</script>
|
||||
|
||||
<Dialog.Root {open} onOpenChange={handleClose}>
|
||||
<Dialog.Content class="flex h-[64vh] flex-col gap-0 p-0" style="max-width: 48rem;">
|
||||
<div class="flex flex-1 overflow-hidden">
|
||||
<div class="w-64 border-r border-border/30 p-6">
|
||||
<Dialog.Content
|
||||
class="z-999999 flex h-[100vh] flex-col gap-0 rounded-none p-0 md:h-[64vh] md:rounded-lg"
|
||||
style="max-width: 48rem;"
|
||||
>
|
||||
<div class="flex flex-1 flex-col overflow-hidden md:flex-row">
|
||||
<!-- Desktop Sidebar -->
|
||||
<div class="hidden w-64 border-r border-border/30 p-6 md:block">
|
||||
<nav class="space-y-1 py-2">
|
||||
<Dialog.Title class="mb-6 flex items-center gap-2">Settings</Dialog.Title>
|
||||
|
||||
@@ -329,134 +387,79 @@
|
||||
</nav>
|
||||
</div>
|
||||
|
||||
<ScrollArea class="flex-1">
|
||||
<div class="space-y-6 p-6">
|
||||
<ChatSettingsSection title={currentSection.title} Icon={currentSection.icon}>
|
||||
{#each currentSection.fields as field (field.key)}
|
||||
<div class="space-y-2">
|
||||
{#if field.type === 'input'}
|
||||
<Label for={field.key} class="block text-sm font-medium">
|
||||
{field.label}
|
||||
</Label>
|
||||
<!-- Mobile Header with Horizontal Scrollable Menu -->
|
||||
<div class="flex flex-col md:hidden">
|
||||
<div class="border-b border-border/30 py-4">
|
||||
<Dialog.Title class="mb-6 flex items-center gap-2 px-4">Settings</Dialog.Title>
|
||||
|
||||
<Input
|
||||
id={field.key}
|
||||
value={String(localConfig[field.key] || '')}
|
||||
onchange={(e) => (localConfig[field.key] = e.currentTarget.value)}
|
||||
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] || 'none'}`}
|
||||
class="max-w-md"
|
||||
/>
|
||||
{#if field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
<p class="mt-1 text-xs text-muted-foreground">
|
||||
{field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
</p>
|
||||
{/if}
|
||||
{:else if field.type === 'textarea'}
|
||||
<Label for={field.key} class="block text-sm font-medium">
|
||||
{field.label}
|
||||
</Label>
|
||||
<!-- Horizontal Scrollable Category Menu with Navigation -->
|
||||
<div class="relative flex items-center" style="scroll-padding: 1rem;">
|
||||
<button
|
||||
class="absolute left-2 z-10 flex h-6 w-6 items-center justify-center rounded-full bg-muted shadow-md backdrop-blur-sm transition-opacity hover:bg-accent {canScrollLeft
|
||||
? 'opacity-100'
|
||||
: 'pointer-events-none opacity-0'}"
|
||||
onclick={scrollLeft}
|
||||
aria-label="Scroll left"
|
||||
>
|
||||
<ChevronLeft class="h-4 w-4" />
|
||||
</button>
|
||||
|
||||
<Textarea
|
||||
id={field.key}
|
||||
value={String(localConfig[field.key] || '')}
|
||||
onchange={(e) => (localConfig[field.key] = e.currentTarget.value)}
|
||||
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] || 'none'}`}
|
||||
class="min-h-[100px] max-w-2xl"
|
||||
/>
|
||||
{#if field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
<p class="mt-1 text-xs text-muted-foreground">
|
||||
{field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
</p>
|
||||
{/if}
|
||||
{:else if field.type === 'select'}
|
||||
{@const selectedOption = field.options?.find(
|
||||
(opt: { value: string; label: string; icon?: Component }) =>
|
||||
opt.value === localConfig[field.key]
|
||||
)}
|
||||
|
||||
<Label for={field.key} class="block text-sm font-medium">
|
||||
{field.label}
|
||||
</Label>
|
||||
|
||||
<Select.Root
|
||||
type="single"
|
||||
value={localConfig[field.key]}
|
||||
onValueChange={(value) => {
|
||||
if (field.key === 'theme' && value) {
|
||||
handleThemeChange(value);
|
||||
} else {
|
||||
localConfig[field.key] = value;
|
||||
}
|
||||
<div
|
||||
class="scrollbar-hide overflow-x-auto py-2"
|
||||
bind:this={scrollContainer}
|
||||
onscroll={updateScrollButtons}
|
||||
>
|
||||
<div class="flex min-w-max gap-2">
|
||||
{#each settingSections as section (section.title)}
|
||||
<button
|
||||
class="flex cursor-pointer items-center gap-2 rounded-lg px-3 py-2 text-sm whitespace-nowrap transition-colors first:ml-4 last:mr-4 hover:bg-accent {activeSection ===
|
||||
section.title
|
||||
? 'bg-accent text-accent-foreground'
|
||||
: 'text-muted-foreground'}"
|
||||
onclick={(e: MouseEvent) => {
|
||||
activeSection = section.title;
|
||||
scrollToCenter(e.currentTarget as HTMLElement);
|
||||
}}
|
||||
>
|
||||
<Select.Trigger class="max-w-md">
|
||||
<div class="flex items-center gap-2">
|
||||
{#if selectedOption?.icon}
|
||||
{@const IconComponent = selectedOption.icon}
|
||||
<IconComponent class="h-4 w-4" />
|
||||
{/if}
|
||||
|
||||
{selectedOption?.label || `Select ${field.label.toLowerCase()}`}
|
||||
</div>
|
||||
</Select.Trigger>
|
||||
<Select.Content>
|
||||
{#if field.options}
|
||||
{#each field.options as option (option.value)}
|
||||
<Select.Item value={option.value} label={option.label}>
|
||||
<div class="flex items-center gap-2">
|
||||
{#if option.icon}
|
||||
{@const IconComponent = option.icon}
|
||||
<IconComponent class="h-4 w-4" />
|
||||
{/if}
|
||||
{option.label}
|
||||
</div>
|
||||
</Select.Item>
|
||||
{/each}
|
||||
{/if}
|
||||
</Select.Content>
|
||||
</Select.Root>
|
||||
{#if field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
<p class="mt-1 text-xs text-muted-foreground">
|
||||
{field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
</p>
|
||||
{/if}
|
||||
{:else if field.type === 'checkbox'}
|
||||
{@const isDisabled = field.key === 'pdfAsImage' && !supportsVision()}
|
||||
<div class="flex items-start space-x-3">
|
||||
<Checkbox
|
||||
id={field.key}
|
||||
checked={Boolean(localConfig[field.key])}
|
||||
disabled={isDisabled}
|
||||
onCheckedChange={(checked) => (localConfig[field.key] = checked)}
|
||||
class="mt-1"
|
||||
/>
|
||||
|
||||
<div class="space-y-1">
|
||||
<label
|
||||
for={field.key}
|
||||
class="cursor-pointer text-sm leading-none font-medium {isDisabled
|
||||
? 'text-muted-foreground'
|
||||
: ''}"
|
||||
>
|
||||
{field.label}
|
||||
</label>
|
||||
|
||||
{#if field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
<p class="text-xs text-muted-foreground">
|
||||
{field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
</p>
|
||||
{:else if field.key === 'pdfAsImage' && !supportsVision()}
|
||||
<p class="text-xs text-muted-foreground">
|
||||
PDF-to-image processing requires a vision-capable model. PDFs will be
|
||||
processed as text.
|
||||
</p>
|
||||
{/if}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
<section.icon class="h-4 w-4 flex-shrink-0" />
|
||||
<span>{section.title}</span>
|
||||
</button>
|
||||
{/each}
|
||||
</div>
|
||||
{/each}
|
||||
</ChatSettingsSection>
|
||||
</div>
|
||||
|
||||
<button
|
||||
class="absolute right-2 z-10 flex h-6 w-6 items-center justify-center rounded-full bg-muted shadow-md backdrop-blur-sm transition-opacity hover:bg-accent {canScrollRight
|
||||
? 'opacity-100'
|
||||
: 'pointer-events-none opacity-0'}"
|
||||
onclick={scrollRight}
|
||||
aria-label="Scroll right"
|
||||
>
|
||||
<ChevronRight class="h-4 w-4" />
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<ScrollArea class="max-h-[calc(100vh-13.5rem)] flex-1">
|
||||
<div class="space-y-6 p-4 md:p-6">
|
||||
<div>
|
||||
<div class="mb-6 flex hidden items-center gap-2 border-b border-border/30 pb-6 md:flex">
|
||||
<currentSection.icon class="h-5 w-5" />
|
||||
|
||||
<h3 class="text-lg font-semibold">{currentSection.title}</h3>
|
||||
</div>
|
||||
|
||||
<div class="space-y-6">
|
||||
<ChatSettingsFields
|
||||
fields={currentSection.fields}
|
||||
{localConfig}
|
||||
onConfigChange={handleConfigChange}
|
||||
onThemeChange={handleThemeChange}
|
||||
isMobile={false}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="mt-8 border-t pt-6">
|
||||
<p class="text-xs text-muted-foreground">
|
||||
@@ -467,6 +470,6 @@
|
||||
</ScrollArea>
|
||||
</div>
|
||||
|
||||
<ChatSettingsFooter onClose={handleClose} onReset={handleReset} onSave={handleSave} />
|
||||
<ChatSettingsFooter onReset={handleReset} onSave={handleSave} />
|
||||
</Dialog.Content>
|
||||
</Dialog.Root>
|
||||
|
||||
@@ -0,0 +1,145 @@
|
||||
<script lang="ts">
|
||||
import { Checkbox } from '$lib/components/ui/checkbox';
|
||||
import { Input } from '$lib/components/ui/input';
|
||||
import Label from '$lib/components/ui/label/label.svelte';
|
||||
import * as Select from '$lib/components/ui/select';
|
||||
import { Textarea } from '$lib/components/ui/textarea';
|
||||
import { SETTING_CONFIG_DEFAULT, SETTING_CONFIG_INFO } from '$lib/constants/settings-config';
|
||||
import { supportsVision } from '$lib/stores/server.svelte';
|
||||
import type { Component } from 'svelte';
|
||||
|
||||
interface Props {
|
||||
fields: SettingsFieldConfig[];
|
||||
localConfig: SettingsConfigType;
|
||||
onConfigChange: (key: string, value: string | boolean) => void;
|
||||
onThemeChange?: (theme: string) => void;
|
||||
isMobile?: boolean;
|
||||
}
|
||||
|
||||
let { fields, localConfig, onConfigChange, onThemeChange, isMobile = false }: Props = $props();
|
||||
</script>
|
||||
|
||||
{#each fields as field (field.key)}
|
||||
<div class="space-y-2">
|
||||
{#if field.type === 'input'}
|
||||
<Label for={field.key} class="block text-sm font-medium">
|
||||
{field.label}
|
||||
</Label>
|
||||
|
||||
<Input
|
||||
id={field.key}
|
||||
value={String(localConfig[field.key] || '')}
|
||||
onchange={(e) => onConfigChange(field.key, e.currentTarget.value)}
|
||||
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] || 'none'}`}
|
||||
class={isMobile ? 'w-full' : 'max-w-md'}
|
||||
/>
|
||||
{#if field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
<p class="mt-1 text-xs text-muted-foreground">
|
||||
{field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
</p>
|
||||
{/if}
|
||||
{:else if field.type === 'textarea'}
|
||||
<Label for={field.key} class="block text-sm font-medium">
|
||||
{field.label}
|
||||
</Label>
|
||||
|
||||
<Textarea
|
||||
id={field.key}
|
||||
value={String(localConfig[field.key] || '')}
|
||||
onchange={(e) => onConfigChange(field.key, e.currentTarget.value)}
|
||||
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] || 'none'}`}
|
||||
class={isMobile ? 'min-h-[100px] w-full' : 'min-h-[100px] max-w-2xl'}
|
||||
/>
|
||||
{#if field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
<p class="mt-1 text-xs text-muted-foreground">
|
||||
{field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
</p>
|
||||
{/if}
|
||||
{:else if field.type === 'select'}
|
||||
{@const selectedOption = field.options?.find(
|
||||
(opt: { value: string; label: string; icon?: Component }) =>
|
||||
opt.value === localConfig[field.key]
|
||||
)}
|
||||
|
||||
<Label for={field.key} class="block text-sm font-medium">
|
||||
{field.label}
|
||||
</Label>
|
||||
|
||||
<Select.Root
|
||||
type="single"
|
||||
value={localConfig[field.key]}
|
||||
onValueChange={(value) => {
|
||||
if (field.key === 'theme' && value && onThemeChange) {
|
||||
onThemeChange(value);
|
||||
} else {
|
||||
onConfigChange(field.key, value);
|
||||
}
|
||||
}}
|
||||
>
|
||||
<Select.Trigger class={isMobile ? 'w-full' : 'max-w-md'}>
|
||||
<div class="flex items-center gap-2">
|
||||
{#if selectedOption?.icon}
|
||||
{@const IconComponent = selectedOption.icon}
|
||||
<IconComponent class="h-4 w-4" />
|
||||
{/if}
|
||||
|
||||
{selectedOption?.label || `Select ${field.label.toLowerCase()}`}
|
||||
</div>
|
||||
</Select.Trigger>
|
||||
<Select.Content>
|
||||
{#if field.options}
|
||||
{#each field.options as option (option.value)}
|
||||
<Select.Item value={option.value} label={option.label}>
|
||||
<div class="flex items-center gap-2">
|
||||
{#if option.icon}
|
||||
{@const IconComponent = option.icon}
|
||||
<IconComponent class="h-4 w-4" />
|
||||
{/if}
|
||||
{option.label}
|
||||
</div>
|
||||
</Select.Item>
|
||||
{/each}
|
||||
{/if}
|
||||
</Select.Content>
|
||||
</Select.Root>
|
||||
{#if field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
<p class="mt-1 text-xs text-muted-foreground">
|
||||
{field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
</p>
|
||||
{/if}
|
||||
{:else if field.type === 'checkbox'}
|
||||
{@const isDisabled = field.key === 'pdfAsImage' && !supportsVision()}
|
||||
<div class="flex items-start space-x-3">
|
||||
<Checkbox
|
||||
id={field.key}
|
||||
checked={Boolean(localConfig[field.key])}
|
||||
disabled={isDisabled}
|
||||
onCheckedChange={(checked) => onConfigChange(field.key, checked)}
|
||||
class="mt-1"
|
||||
/>
|
||||
|
||||
<div class="space-y-1">
|
||||
<label
|
||||
for={field.key}
|
||||
class="cursor-pointer text-sm leading-none font-medium {isDisabled
|
||||
? 'text-muted-foreground'
|
||||
: ''}"
|
||||
>
|
||||
{field.label}
|
||||
</label>
|
||||
|
||||
{#if field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
<p class="text-xs text-muted-foreground">
|
||||
{field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
</p>
|
||||
{:else if field.key === 'pdfAsImage' && !supportsVision()}
|
||||
<p class="text-xs text-muted-foreground">
|
||||
PDF-to-image processing requires a vision-capable model. PDFs will be processed as
|
||||
text.
|
||||
</p>
|
||||
{/if}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
{/each}
|
||||
@@ -2,16 +2,11 @@
|
||||
import { Button } from '$lib/components/ui/button';
|
||||
|
||||
interface Props {
|
||||
onClose?: () => void;
|
||||
onReset?: () => void;
|
||||
onSave?: () => void;
|
||||
}
|
||||
|
||||
let { onClose, onReset, onSave }: Props = $props();
|
||||
|
||||
function handleClose() {
|
||||
onClose?.();
|
||||
}
|
||||
let { onReset, onSave }: Props = $props();
|
||||
|
||||
function handleReset() {
|
||||
onReset?.();
|
||||
@@ -25,9 +20,5 @@
|
||||
<div class="flex justify-between border-t border-border/30 p-6">
|
||||
<Button variant="outline" onclick={handleReset}>Reset to default</Button>
|
||||
|
||||
<div class="flex gap-2">
|
||||
<Button variant="outline" onclick={handleClose}>Close</Button>
|
||||
|
||||
<Button onclick={handleSave}>Save</Button>
|
||||
</div>
|
||||
<Button onclick={handleSave}>Save settings</Button>
|
||||
</div>
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
<script lang="ts">
|
||||
import type { Component, Snippet } from 'svelte';
|
||||
|
||||
interface Props {
|
||||
children: Snippet;
|
||||
title: string;
|
||||
Icon: Component;
|
||||
}
|
||||
|
||||
let { children, title, Icon }: Props = $props();
|
||||
</script>
|
||||
|
||||
<div>
|
||||
<div class="mb-6 flex items-center gap-2 border-b border-border/30 pb-6">
|
||||
<Icon class="h-5 w-5" />
|
||||
|
||||
<h3 class="text-lg font-semibold">{title}</h3>
|
||||
</div>
|
||||
|
||||
<div class="space-y-6">
|
||||
{@render children()}
|
||||
</div>
|
||||
</div>
|
||||
@@ -22,8 +22,8 @@ export { default as ChatScreenHeader } from './chat/ChatScreen/ChatScreenHeader.
|
||||
export { default as ChatScreen } from './chat/ChatScreen/ChatScreen.svelte';
|
||||
|
||||
export { default as ChatSettingsDialog } from './chat/ChatSettings/ChatSettingsDialog.svelte';
|
||||
export { default as ChatSettingsSection } from './chat/ChatSettings/ChatSettingsSection.svelte';
|
||||
export { default as ChatSettingsFooter } from './chat/ChatSettings/ChatSettingsFooter.svelte';
|
||||
export { default as ChatSettingsFields } from './chat/ChatSettings/ChatSettingsFields.svelte';
|
||||
|
||||
export { default as ChatSidebar } from './chat/ChatSidebar/ChatSidebar.svelte';
|
||||
export { default as ChatSidebarConversationItem } from './chat/ChatSidebar/ChatSidebarConversationItem.svelte';
|
||||
|
||||
Reference in New Issue
Block a user