mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-02 07:04:19 +00:00
Compare commits
66 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3f9da22c2b | ||
|
|
2a0dc97e56 | ||
|
|
97a20c012b | ||
|
|
f01bd02376 | ||
|
|
6f3bd38640 | ||
|
|
be0a0f8cae | ||
|
|
92e3006bb6 | ||
|
|
833e2b7409 | ||
|
|
e0e912f49b | ||
|
|
a10b36c91a | ||
|
|
83a88bd6af | ||
|
|
42eb248f46 | ||
|
|
9bacd6b374 | ||
|
|
267c1399f1 | ||
|
|
f423981ac8 | ||
|
|
e39e727e9a | ||
|
|
5936a616e4 | ||
|
|
3fd072a540 | ||
|
|
a6f32f0b34 | ||
|
|
2bb3597e42 | ||
|
|
8293970542 | ||
|
|
8bbf26083d | ||
|
|
35782aeedb | ||
|
|
c80a7759da | ||
|
|
250d7953e8 | ||
|
|
403fbacbbc | ||
|
|
a8a1f33567 | ||
|
|
1790e73157 | ||
|
|
0114a32da0 | ||
|
|
a7724480fd | ||
|
|
1a85949067 | ||
|
|
6c02a032fa | ||
|
|
f52d59d771 | ||
|
|
52de2e5949 | ||
|
|
2c3f8b850a | ||
|
|
4663bd353c | ||
|
|
b3de7cac73 | ||
|
|
7242dd9675 | ||
|
|
492d7f1ff7 | ||
|
|
d3f1f0acfb | ||
|
|
360dc22c00 | ||
|
|
a62d7fa7a9 | ||
|
|
e408d4351a | ||
|
|
3891e183c6 | ||
|
|
af6ae1efb2 | ||
|
|
0bb2919335 | ||
|
|
a69f846351 | ||
|
|
d07a0d7a79 | ||
|
|
3714c3ee1a | ||
|
|
b4ae50810e | ||
|
|
b86f600723 | ||
|
|
dd373dd3bf | ||
|
|
5d01670266 | ||
|
|
ef03229ff4 | ||
|
|
13731766db | ||
|
|
ab6ab8f809 | ||
|
|
2099a9d5db | ||
|
|
2969019837 | ||
|
|
5dec47dcd4 | ||
|
|
f125b8dccf | ||
|
|
953c2a62cf | ||
|
|
d5c6309d91 | ||
|
|
029c693fdc | ||
|
|
771d84371c | ||
|
|
df0665a483 | ||
|
|
0306aad1ca |
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@@ -803,7 +803,7 @@ jobs:
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
SDE_VERSION: 9.33.0-2024-01-07
|
||||
VULKAN_VERSION: 1.4.304.1
|
||||
VULKAN_VERSION: 1.4.309.0
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
|
||||
@@ -112,6 +112,8 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
|
||||
- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
|
||||
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
|
||||
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
|
||||
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
|
||||
|
||||
#### Multimodal
|
||||
|
||||
|
||||
@@ -60,7 +60,7 @@ docker run --privileged -it \
|
||||
Inside the container, execute the following commands:
|
||||
|
||||
```bash
|
||||
apt update -y && apt install -y bc cmake git python3.10-venv time unzip wget
|
||||
apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
|
||||
git config --global --add safe.directory /ws
|
||||
GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
|
||||
```
|
||||
|
||||
@@ -69,7 +69,7 @@ fi
|
||||
if [ ! -z ${GG_BUILD_MUSA} ]; then
|
||||
# Use qy1 by default (MTT S80)
|
||||
MUSA_ARCH=${MUSA_ARCH:-21}
|
||||
CMAKE_EXTRA="-DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
|
||||
fi
|
||||
## helpers
|
||||
|
||||
|
||||
717
common/arg.cpp
717
common/arg.cpp
@@ -1,9 +1,20 @@
|
||||
#include "gguf.h" // for reading GGUF splits
|
||||
#include "arg.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "sampling.h"
|
||||
#include "chat.h"
|
||||
|
||||
// fix problem with std::min and std::max
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
@@ -14,6 +25,14 @@
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
//#define LLAMA_USE_CURL
|
||||
|
||||
#if defined(LLAMA_USE_CURL)
|
||||
#include <curl/curl.h>
|
||||
#include <curl/easy.h>
|
||||
#include <future>
|
||||
#endif
|
||||
|
||||
#include "json-schema-to-grammar.h"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
@@ -125,47 +144,549 @@ std::string common_arg::to_string() {
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
//
|
||||
// downloader
|
||||
//
|
||||
|
||||
struct common_hf_file_res {
|
||||
std::string repo; // repo name with ":tag" removed
|
||||
std::string ggufFile;
|
||||
std::string mmprojFile;
|
||||
};
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
#ifdef __linux__
|
||||
#include <linux/limits.h>
|
||||
#elif defined(_WIN32)
|
||||
# if !defined(PATH_MAX)
|
||||
# define PATH_MAX MAX_PATH
|
||||
# endif
|
||||
#else
|
||||
#include <sys/syslimits.h>
|
||||
#endif
|
||||
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||
|
||||
//
|
||||
// CURL utils
|
||||
//
|
||||
|
||||
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||
|
||||
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
|
||||
struct curl_slist_ptr {
|
||||
struct curl_slist * ptr = nullptr;
|
||||
~curl_slist_ptr() {
|
||||
if (ptr) {
|
||||
curl_slist_free_all(ptr);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#define CURL_MAX_RETRY 3
|
||||
#define CURL_RETRY_DELAY_SECONDS 2
|
||||
|
||||
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
|
||||
int remaining_attempts = max_attempts;
|
||||
|
||||
while (remaining_attempts > 0) {
|
||||
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res == CURLE_OK) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
||||
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
||||
|
||||
remaining_attempts--;
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||
}
|
||||
|
||||
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
|
||||
// Initialize libcurl
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
if (!curl) {
|
||||
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool force_download = false;
|
||||
|
||||
// Set the URL, allow to follow http redirection
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
|
||||
// Check if hf-token or bearer-token was specified
|
||||
if (!bearer_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
}
|
||||
|
||||
#if defined(_WIN32)
|
||||
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
||||
// operating system. Currently implemented under MS-Windows.
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
|
||||
// Check if the file already exists locally
|
||||
auto file_exists = std::filesystem::exists(path);
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
nlohmann::json metadata;
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
|
||||
if (file_exists) {
|
||||
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
||||
std::ifstream metadata_in(metadata_path);
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
||||
auto previous_url = metadata.at("url").get<std::string>();
|
||||
if (previous_url != url) {
|
||||
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
// Send a HEAD request to retrieve the etag and last-modified headers
|
||||
struct common_load_model_from_url_headers {
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
};
|
||||
|
||||
common_load_model_from_url_headers headers;
|
||||
|
||||
{
|
||||
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
||||
|
||||
static std::regex header_regex("([^:]+): (.*)\r\n");
|
||||
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
||||
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
||||
|
||||
std::string header(buffer, n_items);
|
||||
std::smatch match;
|
||||
if (std::regex_match(header, match, header_regex)) {
|
||||
const std::string & key = match[1];
|
||||
const std::string & value = match[2];
|
||||
if (std::regex_match(key, match, etag_regex)) {
|
||||
headers->etag = value;
|
||||
} else if (std::regex_match(key, match, last_modified_regex)) {
|
||||
headers->last_modified = value;
|
||||
}
|
||||
}
|
||||
return n_items;
|
||||
};
|
||||
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code != 200) {
|
||||
// HEAD not supported, we don't know if the file has changed
|
||||
// force trigger downloading
|
||||
force_download = true;
|
||||
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
}
|
||||
}
|
||||
|
||||
bool should_download = !file_exists || force_download;
|
||||
if (!should_download) {
|
||||
if (!etag.empty() && etag != headers.etag) {
|
||||
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
||||
should_download = true;
|
||||
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
||||
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
||||
should_download = true;
|
||||
}
|
||||
}
|
||||
if (should_download) {
|
||||
std::string path_temporary = path + ".downloadInProgress";
|
||||
if (file_exists) {
|
||||
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Set the output file
|
||||
|
||||
struct FILE_deleter {
|
||||
void operator()(FILE * f) const {
|
||||
fclose(f);
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
||||
if (!outfile) {
|
||||
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
|
||||
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
|
||||
return fwrite(data, size, nmemb, (FILE *)fd);
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
|
||||
|
||||
// display download progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
|
||||
|
||||
// helper function to hide password in URL
|
||||
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
||||
std::size_t protocol_pos = url.find("://");
|
||||
if (protocol_pos == std::string::npos) {
|
||||
return url; // Malformed URL
|
||||
}
|
||||
|
||||
std::size_t at_pos = url.find('@', protocol_pos + 3);
|
||||
if (at_pos == std::string::npos) {
|
||||
return url; // No password in URL
|
||||
}
|
||||
|
||||
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
|
||||
};
|
||||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code < 200 || http_code >= 400) {
|
||||
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Causes file to be closed explicitly here before we rename it.
|
||||
outfile.reset();
|
||||
|
||||
// Write the updated JSON metadata file.
|
||||
metadata.update({
|
||||
{"url", url},
|
||||
{"etag", headers.etag},
|
||||
{"lastModified", headers.last_modified}
|
||||
});
|
||||
std::ofstream(metadata_path) << metadata.dump(4);
|
||||
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
|
||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// download multiple files from remote URLs to local paths
|
||||
// the input is a vector of pairs <url, path>
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
|
||||
// Prepare download in parallel
|
||||
std::vector<std::future<bool>> futures_download;
|
||||
for (auto const & item : urls) {
|
||||
futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
|
||||
return common_download_file_single(it.first, it.second, bearer_token);
|
||||
}, item));
|
||||
}
|
||||
|
||||
// Wait for all downloads to complete
|
||||
for (auto & f : futures_download) {
|
||||
if (!f.get()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool common_download_model(
|
||||
const common_params_model & model,
|
||||
const std::string & bearer_token) {
|
||||
// Basic validation of the model.url
|
||||
if (model.url.empty()) {
|
||||
LOG_ERR("%s: invalid model url\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common_download_file_single(model.url, model.path, bearer_token)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// check for additional GGUFs split to download
|
||||
int n_split = 0;
|
||||
{
|
||||
struct gguf_init_params gguf_params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
|
||||
if (!ctx_gguf) {
|
||||
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
|
||||
if (key_n_split >= 0) {
|
||||
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
|
||||
}
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
}
|
||||
|
||||
if (n_split > 1) {
|
||||
char split_prefix[PATH_MAX] = {0};
|
||||
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
|
||||
// Verify the first split file format
|
||||
// and extract split URL and PATH prefixes
|
||||
{
|
||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::pair<std::string, std::string>> urls;
|
||||
for (int idx = 1; idx < n_split; idx++) {
|
||||
char split_path[PATH_MAX] = {0};
|
||||
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
|
||||
|
||||
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
|
||||
|
||||
if (std::string(split_path) == model.path) {
|
||||
continue; // skip the already downloaded file
|
||||
}
|
||||
|
||||
urls.push_back({split_url, split_path});
|
||||
}
|
||||
|
||||
// Download in parallel
|
||||
common_download_file_multiple(urls, bearer_token);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
|
||||
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
|
||||
*
|
||||
* Return pair of <repo, file> (with "repo" already having tag removed)
|
||||
*
|
||||
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
|
||||
*/
|
||||
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
|
||||
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
||||
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
||||
std::string hf_repo = parts[0];
|
||||
if (string_split<std::string>(hf_repo, '/').size() != 2) {
|
||||
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
|
||||
}
|
||||
|
||||
// fetch model info from Hugging Face Hub API
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::string res_str;
|
||||
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
||||
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
|
||||
#if defined(_WIN32)
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
if (!bearer_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
}
|
||||
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
throw std::runtime_error("error: cannot make GET request to HF API");
|
||||
}
|
||||
|
||||
long res_code;
|
||||
std::string ggufFile = "";
|
||||
std::string mmprojFile = "";
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
||||
if (res_code == 200) {
|
||||
// extract ggufFile.rfilename in json, using regex
|
||||
{
|
||||
std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
|
||||
std::smatch match;
|
||||
if (std::regex_search(res_str, match, pattern)) {
|
||||
ggufFile = match[1].str();
|
||||
}
|
||||
}
|
||||
// extract mmprojFile.rfilename in json, using regex
|
||||
{
|
||||
std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
|
||||
std::smatch match;
|
||||
if (std::regex_search(res_str, match, pattern)) {
|
||||
mmprojFile = match[1].str();
|
||||
}
|
||||
}
|
||||
} else if (res_code == 401) {
|
||||
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
|
||||
} else {
|
||||
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
|
||||
}
|
||||
|
||||
// check response
|
||||
if (ggufFile.empty()) {
|
||||
throw std::runtime_error("error: model does not have ggufFile");
|
||||
}
|
||||
|
||||
return { hf_repo, ggufFile, mmprojFile };
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from internet\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool common_download_model(
|
||||
const common_params_model &,
|
||||
const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
||||
return {};
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
// utils
|
||||
//
|
||||
|
||||
static void common_params_handle_model_default(
|
||||
std::string & model,
|
||||
const std::string & model_url,
|
||||
std::string & hf_repo,
|
||||
std::string & hf_file,
|
||||
const std::string & hf_token,
|
||||
const std::string & model_default) {
|
||||
if (!hf_repo.empty()) {
|
||||
// short-hand to avoid specifying --hf-file -> default it to --model
|
||||
if (hf_file.empty()) {
|
||||
if (model.empty()) {
|
||||
auto auto_detected = common_get_hf_file(hf_repo, hf_token);
|
||||
if (auto_detected.first.empty() || auto_detected.second.empty()) {
|
||||
exit(1); // built without CURL, error message already printed
|
||||
static void common_params_handle_model(
|
||||
struct common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
const std::string & model_path_default,
|
||||
bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
|
||||
// handle pre-fill default model path and url based on hf_repo and hf_file
|
||||
{
|
||||
if (!model.hf_repo.empty()) {
|
||||
// short-hand to avoid specifying --hf-file -> default it to --model
|
||||
if (model.hf_file.empty()) {
|
||||
if (model.path.empty()) {
|
||||
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
|
||||
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
|
||||
exit(1); // built without CURL, error message already printed
|
||||
}
|
||||
model.hf_repo = auto_detected.repo;
|
||||
model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
|
||||
} else {
|
||||
model.hf_file = model.path;
|
||||
}
|
||||
hf_repo = auto_detected.first;
|
||||
hf_file = auto_detected.second;
|
||||
} else {
|
||||
hf_file = model;
|
||||
}
|
||||
|
||||
// TODO: allow custom host
|
||||
model.url = "https://huggingface.co/" + model.hf_repo + "/resolve/main/" + model.hf_file;
|
||||
|
||||
// make sure model path is present (for caching purposes)
|
||||
if (model.path.empty()) {
|
||||
// this is to avoid different repo having same file name, or same file name in different subdirs
|
||||
std::string filename = model.hf_repo + "_" + model.hf_file;
|
||||
// to make sure we don't have any slashes in the filename
|
||||
string_replace_all(filename, "/", "_");
|
||||
model.path = fs_get_cache_file(filename);
|
||||
}
|
||||
|
||||
} else if (!model.url.empty()) {
|
||||
if (model.path.empty()) {
|
||||
auto f = string_split<std::string>(model.url, '#').front();
|
||||
f = string_split<std::string>(f, '?').front();
|
||||
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
||||
}
|
||||
|
||||
} else if (model.path.empty()) {
|
||||
model.path = model_path_default;
|
||||
}
|
||||
// make sure model path is present (for caching purposes)
|
||||
if (model.empty()) {
|
||||
// this is to avoid different repo having same file name, or same file name in different subdirs
|
||||
std::string filename = hf_repo + "_" + hf_file;
|
||||
// to make sure we don't have any slashes in the filename
|
||||
string_replace_all(filename, "/", "_");
|
||||
model = fs_get_cache_file(filename);
|
||||
}
|
||||
|
||||
// then, download it if needed
|
||||
if (!model.url.empty()) {
|
||||
bool ok = common_download_model(model, bearer_token);
|
||||
if (!ok) {
|
||||
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
|
||||
exit(1);
|
||||
}
|
||||
} else if (!model_url.empty()) {
|
||||
if (model.empty()) {
|
||||
auto f = string_split<std::string>(model_url, '#').front();
|
||||
f = string_split<std::string>(f, '?').front();
|
||||
model = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
||||
}
|
||||
} else if (model.empty()) {
|
||||
model = model_default;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -300,10 +821,16 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
||||
}
|
||||
|
||||
// TODO: refactor model params in a common struct
|
||||
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH);
|
||||
common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
|
||||
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, "");
|
||||
common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
|
||||
common_params_handle_model(params.speculative.model, params.hf_token, "");
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, "");
|
||||
|
||||
// allow --mmproj to be set from -hf
|
||||
// assuming that mmproj is always in the same repo as text model
|
||||
if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
|
||||
params.mmproj.hf_repo = params.model.hf_repo;
|
||||
}
|
||||
common_params_handle_model(params.mmproj, params.hf_token, "", true);
|
||||
|
||||
if (params.escape) {
|
||||
string_process_escapes(params.prompt);
|
||||
@@ -322,6 +849,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
if (!params.tensor_buft_overrides.empty()) {
|
||||
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||
}
|
||||
|
||||
if (params.reranking && params.embedding) {
|
||||
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
|
||||
}
|
||||
@@ -1561,7 +2092,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--mmproj"}, "FILE",
|
||||
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj = value;
|
||||
params.mmproj.path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-url"}, "URL",
|
||||
"URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.url = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
||||
add_opt(common_arg(
|
||||
@@ -1647,6 +2185,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
exit(0);
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
|
||||
"override tensor buffer type", [](common_params & params, const std::string & value) {
|
||||
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
||||
if (buft_list.empty()) {
|
||||
// enumerate all the devices and add their buffer types to the list
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
auto * buft = ggml_backend_dev_buffer_type(dev);
|
||||
if (buft) {
|
||||
buft_list[ggml_backend_buft_name(buft)] = buft;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & override : string_split<std::string>(value, ',')) {
|
||||
std::string::size_type pos = override.find('=');
|
||||
if (pos == std::string::npos) {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
std::string tensor_name = override.substr(0, pos);
|
||||
std::string buffer_type = override.substr(pos + 1);
|
||||
|
||||
if (buft_list.find(buffer_type) == buft_list.end()) {
|
||||
printf("Available buffer types:\n");
|
||||
for (const auto & it : buft_list) {
|
||||
printf(" %s\n", ggml_backend_buft_name(it.second));
|
||||
}
|
||||
throw std::invalid_argument("unknown buffer type");
|
||||
}
|
||||
// FIXME: this leaks memory
|
||||
params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||
"number of layers to store in VRAM",
|
||||
@@ -1790,14 +2363,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
|
||||
),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model = value;
|
||||
params.model.path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
||||
add_opt(common_arg(
|
||||
{"-mu", "--model-url"}, "MODEL_URL",
|
||||
"model download url (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model_url = value;
|
||||
params.model.url = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_MODEL_URL"));
|
||||
add_opt(common_arg(
|
||||
@@ -1806,35 +2379,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"example: unsloth/phi-4-GGUF:q4_k_m\n"
|
||||
"(default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.hf_repo = value;
|
||||
params.model.hf_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_REPO"));
|
||||
add_opt(common_arg(
|
||||
{"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
|
||||
"Same as --hf-repo, but for the draft model (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.speculative.hf_repo = value;
|
||||
params.speculative.model.hf_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HFD_REPO"));
|
||||
add_opt(common_arg(
|
||||
{"-hff", "--hf-file"}, "FILE",
|
||||
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.hf_file = value;
|
||||
params.model.hf_file = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
|
||||
"Hugging Face model repository for the vocoder model (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.vocoder.hf_repo = value;
|
||||
params.vocoder.model.hf_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_REPO_V"));
|
||||
add_opt(common_arg(
|
||||
{"-hffv", "--hf-file-v"}, "FILE",
|
||||
"Hugging Face model file for the vocoder model (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.vocoder.hf_file = value;
|
||||
params.vocoder.model.hf_file = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_FILE_V"));
|
||||
add_opt(common_arg(
|
||||
@@ -1979,7 +2552,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
||||
add_opt(common_arg(
|
||||
{"--host"}, "HOST",
|
||||
string_format("ip address to listen (default: %s)", params.hostname.c_str()),
|
||||
string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.hostname = value;
|
||||
}
|
||||
@@ -2454,7 +3027,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"-md", "--model-draft"}, "FNAME",
|
||||
"draft model for speculative decoding (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.speculative.model = value;
|
||||
params.speculative.model.path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
|
||||
|
||||
@@ -2462,7 +3035,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"-mv", "--model-vocoder"}, "FNAME",
|
||||
"vocoder model for audio generation (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.vocoder.model = value;
|
||||
params.vocoder.model.path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
@@ -2485,10 +3058,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--tts-oute-default"},
|
||||
string_format("use default OuteTTS models (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
|
||||
params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
|
||||
params.vocoder.hf_repo = "ggml-org/WavTokenizer";
|
||||
params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
|
||||
params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
|
||||
params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
|
||||
params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
|
||||
params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_TTS}));
|
||||
|
||||
@@ -2496,8 +3069,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--embd-bge-small-en-default"},
|
||||
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
|
||||
params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
|
||||
params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
|
||||
params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
|
||||
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||
params.embd_normalize = 2;
|
||||
params.n_ctx = 512;
|
||||
@@ -2510,8 +3083,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--embd-e5-small-en-default"},
|
||||
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
|
||||
params.hf_file = "e5-small-v2-q8_0.gguf";
|
||||
params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
|
||||
params.model.hf_file = "e5-small-v2-q8_0.gguf";
|
||||
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||
params.embd_normalize = 2;
|
||||
params.n_ctx = 512;
|
||||
@@ -2524,8 +3097,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--embd-gte-small-default"},
|
||||
string_format("use default gte-small model (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
|
||||
params.hf_file = "gte-small-q8_0.gguf";
|
||||
params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
|
||||
params.model.hf_file = "gte-small-q8_0.gguf";
|
||||
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||
params.embd_normalize = 2;
|
||||
params.n_ctx = 512;
|
||||
@@ -2538,8 +3111,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--fim-qwen-1.5b-default"},
|
||||
string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
|
||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
|
||||
params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
params.flash_attn = true;
|
||||
@@ -2554,8 +3127,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--fim-qwen-3b-default"},
|
||||
string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
|
||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
|
||||
params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
params.flash_attn = true;
|
||||
@@ -2570,8 +3143,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--fim-qwen-7b-default"},
|
||||
string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
||||
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
params.flash_attn = true;
|
||||
@@ -2586,10 +3159,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--fim-qwen-7b-spec"},
|
||||
string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
||||
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||
params.speculative.n_gpu_layers = 99;
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
@@ -2605,10 +3178,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--fim-qwen-14b-spec"},
|
||||
string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
|
||||
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
|
||||
params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
|
||||
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||
params.speculative.n_gpu_layers = 99;
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
|
||||
@@ -7,9 +7,6 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include "json.hpp"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -51,47 +48,11 @@
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#if defined(LLAMA_USE_CURL)
|
||||
#include <curl/curl.h>
|
||||
#include <curl/easy.h>
|
||||
#include <future>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if defined(LLAMA_USE_CURL)
|
||||
#ifdef __linux__
|
||||
#include <linux/limits.h>
|
||||
#elif defined(_WIN32)
|
||||
# if !defined(PATH_MAX)
|
||||
# define PATH_MAX MAX_PATH
|
||||
# endif
|
||||
#else
|
||||
#include <sys/syslimits.h>
|
||||
#endif
|
||||
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||
|
||||
//
|
||||
// CURL utils
|
||||
//
|
||||
|
||||
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||
|
||||
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
|
||||
struct curl_slist_ptr {
|
||||
struct curl_slist * ptr = nullptr;
|
||||
~curl_slist_ptr() {
|
||||
if (ptr) {
|
||||
curl_slist_free_all(ptr);
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
//
|
||||
// CPU utils
|
||||
//
|
||||
@@ -900,22 +861,14 @@ std::string fs_get_cache_file(const std::string & filename) {
|
||||
//
|
||||
// Model utils
|
||||
//
|
||||
|
||||
struct common_init_result common_init_from_params(common_params & params) {
|
||||
common_init_result iparams;
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
|
||||
llama_model * model = nullptr;
|
||||
|
||||
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
||||
model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
|
||||
} else if (!params.model_url.empty()) {
|
||||
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
|
||||
} else {
|
||||
model = llama_model_load_from_file(params.model.c_str(), mparams);
|
||||
}
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
|
||||
return iparams;
|
||||
}
|
||||
|
||||
@@ -950,7 +903,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
|
||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||
llama_model_free(model);
|
||||
return iparams;
|
||||
}
|
||||
@@ -1089,15 +1042,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||
if (!params.devices.empty()) {
|
||||
mparams.devices = params.devices.data();
|
||||
}
|
||||
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
mparams.check_tensors = params.check_tensors;
|
||||
|
||||
if (params.kv_overrides.empty()) {
|
||||
mparams.kv_overrides = NULL;
|
||||
} else {
|
||||
@@ -1105,6 +1061,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||
mparams.kv_overrides = params.kv_overrides.data();
|
||||
}
|
||||
|
||||
if (params.tensor_buft_overrides.empty()) {
|
||||
mparams.tensor_buft_overrides = NULL;
|
||||
} else {
|
||||
GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
|
||||
mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
|
||||
}
|
||||
|
||||
return mparams;
|
||||
}
|
||||
|
||||
@@ -1164,451 +1127,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
|
||||
return tpp;
|
||||
}
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
#define CURL_MAX_RETRY 3
|
||||
#define CURL_RETRY_DELAY_SECONDS 2
|
||||
|
||||
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
|
||||
int remaining_attempts = max_attempts;
|
||||
|
||||
while (remaining_attempts > 0) {
|
||||
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res == CURLE_OK) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
||||
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
||||
|
||||
remaining_attempts--;
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||
}
|
||||
|
||||
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
|
||||
// Initialize libcurl
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
if (!curl) {
|
||||
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool force_download = false;
|
||||
|
||||
// Set the URL, allow to follow http redirection
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
|
||||
// Check if hf-token or bearer-token was specified
|
||||
if (!hf_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + hf_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
}
|
||||
|
||||
#if defined(_WIN32)
|
||||
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
||||
// operating system. Currently implemented under MS-Windows.
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
|
||||
// Check if the file already exists locally
|
||||
auto file_exists = std::filesystem::exists(path);
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
nlohmann::json metadata;
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
|
||||
if (file_exists) {
|
||||
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
||||
std::ifstream metadata_in(metadata_path);
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
||||
auto previous_url = metadata.at("url").get<std::string>();
|
||||
if (previous_url != url) {
|
||||
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
// Send a HEAD request to retrieve the etag and last-modified headers
|
||||
struct common_load_model_from_url_headers {
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
};
|
||||
|
||||
common_load_model_from_url_headers headers;
|
||||
|
||||
{
|
||||
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
||||
|
||||
static std::regex header_regex("([^:]+): (.*)\r\n");
|
||||
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
||||
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
||||
|
||||
std::string header(buffer, n_items);
|
||||
std::smatch match;
|
||||
if (std::regex_match(header, match, header_regex)) {
|
||||
const std::string & key = match[1];
|
||||
const std::string & value = match[2];
|
||||
if (std::regex_match(key, match, etag_regex)) {
|
||||
headers->etag = value;
|
||||
} else if (std::regex_match(key, match, last_modified_regex)) {
|
||||
headers->last_modified = value;
|
||||
}
|
||||
}
|
||||
return n_items;
|
||||
};
|
||||
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code != 200) {
|
||||
// HEAD not supported, we don't know if the file has changed
|
||||
// force trigger downloading
|
||||
force_download = true;
|
||||
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
}
|
||||
}
|
||||
|
||||
bool should_download = !file_exists || force_download;
|
||||
if (!should_download) {
|
||||
if (!etag.empty() && etag != headers.etag) {
|
||||
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
||||
should_download = true;
|
||||
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
||||
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
||||
should_download = true;
|
||||
}
|
||||
}
|
||||
if (should_download) {
|
||||
std::string path_temporary = path + ".downloadInProgress";
|
||||
if (file_exists) {
|
||||
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Set the output file
|
||||
|
||||
struct FILE_deleter {
|
||||
void operator()(FILE * f) const {
|
||||
fclose(f);
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
||||
if (!outfile) {
|
||||
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
|
||||
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
|
||||
return fwrite(data, size, nmemb, (FILE *)fd);
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
|
||||
|
||||
// display download progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
|
||||
|
||||
// helper function to hide password in URL
|
||||
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
||||
std::size_t protocol_pos = url.find("://");
|
||||
if (protocol_pos == std::string::npos) {
|
||||
return url; // Malformed URL
|
||||
}
|
||||
|
||||
std::size_t at_pos = url.find('@', protocol_pos + 3);
|
||||
if (at_pos == std::string::npos) {
|
||||
return url; // No password in URL
|
||||
}
|
||||
|
||||
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
|
||||
};
|
||||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code < 200 || http_code >= 400) {
|
||||
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Causes file to be closed explicitly here before we rename it.
|
||||
outfile.reset();
|
||||
|
||||
// Write the updated JSON metadata file.
|
||||
metadata.update({
|
||||
{"url", url},
|
||||
{"etag", headers.etag},
|
||||
{"lastModified", headers.last_modified}
|
||||
});
|
||||
std::ofstream(metadata_path) << metadata.dump(4);
|
||||
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
|
||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const std::string & model_url,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
// Basic validation of the model_url
|
||||
if (model_url.empty()) {
|
||||
LOG_ERR("%s: invalid model_url\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!common_download_file(model_url, local_path, hf_token)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// check for additional GGUFs split to download
|
||||
int n_split = 0;
|
||||
{
|
||||
struct gguf_init_params gguf_params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
|
||||
if (!ctx_gguf) {
|
||||
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
|
||||
if (key_n_split >= 0) {
|
||||
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
|
||||
}
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
}
|
||||
|
||||
if (n_split > 1) {
|
||||
char split_prefix[PATH_MAX] = {0};
|
||||
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
|
||||
// Verify the first split file format
|
||||
// and extract split URL and PATH prefixes
|
||||
{
|
||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare download in parallel
|
||||
std::vector<std::future<bool>> futures_download;
|
||||
for (int idx = 1; idx < n_split; idx++) {
|
||||
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
|
||||
char split_path[PATH_MAX] = {0};
|
||||
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
|
||||
|
||||
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
|
||||
|
||||
return common_download_file(split_url, split_path, hf_token);
|
||||
}, idx));
|
||||
}
|
||||
|
||||
// Wait for all downloads to complete
|
||||
for (auto & f : futures_download) {
|
||||
if (!f.get()) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return llama_model_load_from_file(local_path.c_str(), params);
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const std::string & repo,
|
||||
const std::string & remote_path,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
// construct hugging face model url:
|
||||
//
|
||||
// --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
|
||||
// https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
|
||||
//
|
||||
// --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
|
||||
// https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
|
||||
//
|
||||
|
||||
std::string model_url = "https://huggingface.co/";
|
||||
model_url += repo;
|
||||
model_url += "/resolve/main/";
|
||||
model_url += remote_path;
|
||||
|
||||
return common_load_model_from_url(model_url, local_path, hf_token, params);
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
|
||||
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
|
||||
*
|
||||
* Return pair of <repo, file> (with "repo" already having tag removed)
|
||||
*
|
||||
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
|
||||
*/
|
||||
std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
|
||||
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
||||
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
||||
std::string hf_repo = parts[0];
|
||||
if (string_split<std::string>(hf_repo, '/').size() != 2) {
|
||||
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
|
||||
}
|
||||
|
||||
// fetch model info from Hugging Face Hub API
|
||||
json model_info;
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::string res_str;
|
||||
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
||||
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
|
||||
#if defined(_WIN32)
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
if (!hf_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + hf_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
}
|
||||
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
throw std::runtime_error("error: cannot make GET request to HF API");
|
||||
}
|
||||
|
||||
long res_code;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
||||
if (res_code == 200) {
|
||||
model_info = json::parse(res_str);
|
||||
} else if (res_code == 401) {
|
||||
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
|
||||
} else {
|
||||
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
|
||||
}
|
||||
|
||||
// check response
|
||||
if (!model_info.contains("ggufFile")) {
|
||||
throw std::runtime_error("error: model does not have ggufFile");
|
||||
}
|
||||
json & gguf_file = model_info.at("ggufFile");
|
||||
if (!gguf_file.contains("rfilename")) {
|
||||
throw std::runtime_error("error: ggufFile does not have rfilename");
|
||||
}
|
||||
|
||||
return std::make_pair(hf_repo, gguf_file.at("rfilename"));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const std::string & /*model_url*/,
|
||||
const std::string & /*local_path*/,
|
||||
const std::string & /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const std::string & /*repo*/,
|
||||
const std::string & /*remote_path*/,
|
||||
const std::string & /*local_path*/,
|
||||
const std::string & /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
return std::make_pair("", "");
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
// Batch utils
|
||||
//
|
||||
@@ -2032,26 +1550,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <>
|
||||
json common_grammar_trigger::to_json() const {
|
||||
json out {
|
||||
{"type", (int) type},
|
||||
{"value", value},
|
||||
};
|
||||
if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
out["token"] = (int) token;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <>
|
||||
common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
|
||||
common_grammar_trigger out;
|
||||
out.type = (common_grammar_trigger_type) in.at("type").get<int>();
|
||||
out.value = in.at("value").get<std::string>();
|
||||
if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
out.token = (llama_token) in.at("token").get<int>();
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
@@ -121,10 +121,6 @@ struct common_grammar_trigger {
|
||||
common_grammar_trigger_type type;
|
||||
std::string value;
|
||||
llama_token token = LLAMA_TOKEN_NULL;
|
||||
|
||||
// T can only be nlohmann::ordered_json
|
||||
template <class T> T to_json() const;
|
||||
template <class T> static common_grammar_trigger from_json(const T & in);
|
||||
};
|
||||
|
||||
// sampling parameters
|
||||
@@ -184,6 +180,13 @@ struct common_params_sampling {
|
||||
std::string print() const;
|
||||
};
|
||||
|
||||
struct common_params_model {
|
||||
std::string path = ""; // model local path // NOLINT
|
||||
std::string url = ""; // model url to download // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
@@ -197,19 +200,11 @@ struct common_params_speculative {
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
|
||||
std::string model = ""; // draft model for speculative decoding // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
struct common_params_model model;
|
||||
};
|
||||
|
||||
struct common_params_vocoder {
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
|
||||
std::string model = ""; // model path // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
struct common_params_model model;
|
||||
|
||||
std::string speaker_file = ""; // speaker file path // NOLINT
|
||||
|
||||
@@ -267,12 +262,10 @@ struct common_params {
|
||||
struct common_params_speculative speculative;
|
||||
struct common_params_vocoder vocoder;
|
||||
|
||||
std::string model = ""; // model path // NOLINT
|
||||
struct common_params_model model;
|
||||
|
||||
std::string model_alias = ""; // model alias // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
std::string hf_token = ""; // HF token // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string prompt = ""; // NOLINT
|
||||
std::string system_prompt = ""; // NOLINT
|
||||
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
||||
@@ -286,6 +279,7 @@ struct common_params {
|
||||
std::vector<std::string> in_files; // all input files
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
|
||||
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
|
||||
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
|
||||
@@ -347,7 +341,7 @@ struct common_params {
|
||||
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
|
||||
|
||||
// multimodal models (see examples/llava)
|
||||
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
||||
struct common_params_model mmproj;
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
|
||||
// embedding
|
||||
@@ -546,23 +540,6 @@ struct llama_model_params common_model_params_to_llama ( common_params
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const std::string & model_url,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const std::string & repo,
|
||||
const std::string & remote_path,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
|
||||
std::pair<std::string, std::string> common_get_hf_file(
|
||||
const std::string & hf_repo_with_tag,
|
||||
const std::string & hf_token);
|
||||
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
|
||||
|
||||
|
||||
@@ -240,7 +240,7 @@ public:
|
||||
auto index = key.get<int>();
|
||||
return array_->at(index < 0 ? array_->size() + index : index);
|
||||
} else if (object_) {
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
|
||||
auto it = object_->find(key.primitive_);
|
||||
if (it == object_->end()) return Value();
|
||||
return it->second;
|
||||
@@ -249,7 +249,7 @@ public:
|
||||
}
|
||||
void set(const Value& key, const Value& value) {
|
||||
if (!object_) throw std::runtime_error("Value is not an object: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
|
||||
(*object_)[key.primitive_] = value;
|
||||
}
|
||||
Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {
|
||||
|
||||
@@ -208,6 +208,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
trigger_tokens.data(), trigger_tokens.size())
|
||||
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
|
||||
if (!grmr) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto * result = new common_sampler {
|
||||
|
||||
@@ -708,6 +708,12 @@ class Model:
|
||||
if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
|
||||
# ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
|
||||
res = "superbpe"
|
||||
if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
|
||||
# ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
|
||||
res = "trillion"
|
||||
if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
|
||||
# ref: https://huggingface.co/inclusionAI/Ling-lite
|
||||
res = "bailingmoe"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2269,7 +2275,7 @@ class Qwen2Model(Model):
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
|
||||
|
||||
@Model.register("Qwen2VLForConditionalGeneration")
|
||||
@Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
|
||||
class Qwen2VLModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN2VL
|
||||
|
||||
@@ -3551,8 +3557,8 @@ class RWKV6Qwen2Model(Rwkv6Model):
|
||||
head_size = hidden_size // num_attention_heads
|
||||
rms_norm_eps = self.hparams["rms_norm_eps"]
|
||||
intermediate_size = self.hparams["intermediate_size"]
|
||||
time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
|
||||
time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
|
||||
time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
|
||||
time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)
|
||||
|
||||
# RWKV isn't context limited
|
||||
self.gguf_writer.add_context_length(1048576)
|
||||
@@ -4419,6 +4425,29 @@ class DeepseekV2Model(Model):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("PLMForCausalLM")
|
||||
class PLMModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.PLM
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
|
||||
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
|
||||
self.gguf_writer.add_value_length(hparams["v_head_dim"])
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
|
||||
@Model.register("T5WithLMHeadModel")
|
||||
@Model.register("T5ForConditionalGeneration")
|
||||
@Model.register("MT5ForConditionalGeneration")
|
||||
@@ -5107,6 +5136,105 @@ class GraniteMoeModel(GraniteModel):
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@Model.register("BailingMoeForCausalLM")
|
||||
class BailingMoeModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.BAILINGMOE
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
|
||||
self.gguf_writer.add_expert_weights_scale(1.0)
|
||||
self.gguf_writer.add_expert_count(hparams["num_experts"])
|
||||
self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
|
||||
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
n_embd = self.hparams["hidden_size"]
|
||||
head_dim = self.hparams.get("head_dim") or n_embd // n_head
|
||||
|
||||
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
|
||||
|
||||
if name.endswith("attention.dense.weight"):
|
||||
return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
|
||||
elif name.endswith("query_key_value.weight"):
|
||||
q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
|
||||
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
|
||||
]
|
||||
elif name.find("mlp.experts") != -1:
|
||||
n_experts = self.hparams["num_experts"]
|
||||
assert bid is not None
|
||||
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
|
||||
return tensors
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
if new_name == output_name and self.hparams.get("norm_head"):
|
||||
data_torch = data_torch.float()
|
||||
data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
|
||||
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("ChameleonForConditionalGeneration")
|
||||
@Model.register("ChameleonForCausalLM") # obsolete
|
||||
class ChameleonModel(Model):
|
||||
|
||||
@@ -111,6 +111,8 @@ models = [
|
||||
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
|
||||
{"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
|
||||
{"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
|
||||
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
|
||||
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
||||
|
||||
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
|
||||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||
|
||||
@@ -227,16 +227,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|
||||
|
||||
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
|
||||
|
||||
|
||||
**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
|
||||
|
||||
```sh
|
||||
git clone https://github.com/oneapi-src/oneMKL
|
||||
cd oneMKL
|
||||
cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
|
||||
cmake --build buildWithCublas --config Release
|
||||
```
|
||||
|
||||
**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
|
||||
|
||||
```sh
|
||||
@@ -250,16 +240,6 @@ cmake --build build-nvidia --config Release
|
||||
|
||||
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
|
||||
|
||||
**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
|
||||
|
||||
```sh
|
||||
git clone https://github.com/oneapi-src/oneMKL
|
||||
cd oneMKL
|
||||
# Find your HIPTARGET with rocminfo, under the key 'Name:'
|
||||
cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
|
||||
cmake --build buildWithrocBLAS --config Release
|
||||
```
|
||||
|
||||
3. **Verify installation and environment**
|
||||
|
||||
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
|
||||
@@ -324,13 +304,10 @@ cmake --build build --config Release -j -v
|
||||
|
||||
#### Nvidia GPU
|
||||
|
||||
```sh
|
||||
# Export relevant ENV variables
|
||||
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
|
||||
export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
||||
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
|
||||
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
|
||||
|
||||
```sh
|
||||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
||||
# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
|
||||
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
|
||||
@@ -347,12 +324,10 @@ cmake --build build --config Release -j -v
|
||||
|
||||
#### AMD GPU
|
||||
|
||||
```sh
|
||||
# Export relevant ENV variables
|
||||
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
|
||||
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
|
||||
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
|
||||
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
|
||||
|
||||
```sh
|
||||
# Build LLAMA with rocBLAS acceleration through SYCL
|
||||
|
||||
## AMD
|
||||
|
||||
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
|
||||
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: error: unable to load model\n" , __func__);
|
||||
|
||||
@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
g_verbose = (params.verbosity > 1);
|
||||
try {
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
|
||||
lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
|
||||
ctx.run_merge();
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s\n", err.what());
|
||||
|
||||
@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
||||
|
||||
// create generation context
|
||||
llama_context * ctx = llama_init_from_model(model, cparams);
|
||||
|
||||
@@ -4,6 +4,26 @@
|
||||
>
|
||||
> This is very experimental, only used for demo purpose.
|
||||
|
||||
## Quick started
|
||||
|
||||
You can use pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account
|
||||
|
||||
```bash
|
||||
# build
|
||||
cmake -B build
|
||||
cmake --build build --target llama-gemma3-cli
|
||||
|
||||
# alternatively, install from brew (MacOS)
|
||||
brew install llama.cpp
|
||||
|
||||
# run it
|
||||
llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
|
||||
llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF
|
||||
|
||||
# note: 1B model does not support vision
|
||||
```
|
||||
|
||||
## How to get mmproj.gguf?
|
||||
|
||||
```bash
|
||||
|
||||
@@ -1396,14 +1396,16 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
const int ftype = get_u32(ctx, KEY_FTYPE);
|
||||
const std::string ftype_str = get_ftype(ftype);
|
||||
const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
|
||||
const std::string description = gguf_get_val_str(ctx, idx_desc);
|
||||
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
||||
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
|
||||
const std::string name = gguf_get_val_str(ctx, idx_name);
|
||||
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
|
||||
}
|
||||
LOG_INF("%s: description: %s\n", __func__, description.c_str());
|
||||
const int idx_desc = gguf_find_key(ctx, KEY_DESCRIPTION);
|
||||
if (idx_desc != -1) { // ditto
|
||||
const std::string description = gguf_get_val_str(ctx, idx_desc);
|
||||
LOG_INF("%s: description: %s\n", __func__, description.c_str());
|
||||
}
|
||||
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
||||
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
|
||||
@@ -78,7 +78,7 @@ struct gemma3_context {
|
||||
}
|
||||
|
||||
void init_clip_model(common_params & params) {
|
||||
const char * clip_path = params.mmproj.c_str();
|
||||
const char * clip_path = params.mmproj.path.c_str();
|
||||
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
|
||||
}
|
||||
|
||||
@@ -232,13 +232,13 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.empty()) {
|
||||
if (params.mmproj.path.empty()) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
gemma3_context ctx(params);
|
||||
printf("%s: %s\n", __func__, params.model.c_str());
|
||||
printf("%s: %s\n", __func__, params.model.path.c_str());
|
||||
|
||||
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
|
||||
|
||||
|
||||
@@ -225,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
@@ -234,7 +234,7 @@ static struct llama_model * llava_init(common_params * params) {
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
const char * clip_path = params->mmproj.c_str();
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
@@ -283,7 +283,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
@@ -80,7 +80,7 @@ static void llava_free(struct llava_context * ctx_llava) {
|
||||
}
|
||||
|
||||
static struct clip_ctx * clip_init_context(common_params * params) {
|
||||
const char * clip_path = params->mmproj.c_str();
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty())) {
|
||||
if (params.mmproj.path.empty() || (params.image.empty())) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -314,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
@@ -323,7 +323,7 @@ static struct llama_model * llava_init(common_params * params) {
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
const char * clip_path = params->mmproj.c_str();
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
@@ -524,7 +524,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -106,6 +106,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_params params;
|
||||
|
||||
params.n_predict = 128;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
|
||||
return 1;
|
||||
}
|
||||
@@ -405,7 +407,7 @@ int main(int argc, char ** argv) {
|
||||
params.prompt_file = "used built-in defaults";
|
||||
}
|
||||
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
|
||||
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
|
||||
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
|
||||
|
||||
LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
||||
|
||||
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
|
||||
@@ -1,2 +1,4 @@
|
||||
add_executable(rpc-server rpc-server.cpp)
|
||||
target_link_libraries(rpc-server PRIVATE ggml llama)
|
||||
set(TARGET rpc-server)
|
||||
add_executable(${TARGET} rpc-server.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE ggml)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -72,3 +72,14 @@ $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name
|
||||
|
||||
This way you can offload model layers to both local and remote devices.
|
||||
|
||||
### Local cache
|
||||
|
||||
The RPC server can use a local cache to store large tensors and avoid transferring them over the network.
|
||||
This can speed up model loading significantly, especially when using large models.
|
||||
To enable the cache, use the `-c` option:
|
||||
|
||||
```bash
|
||||
$ bin/rpc-server -c
|
||||
```
|
||||
|
||||
By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable.
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
#if defined(_MSC_VER)
|
||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
@@ -18,26 +22,142 @@
|
||||
|
||||
#include "ggml-rpc.h"
|
||||
#ifdef _WIN32
|
||||
# define DIRECTORY_SEPARATOR '\\'
|
||||
# include <locale>
|
||||
# include <windows.h>
|
||||
# include <fcntl.h>
|
||||
# include <io.h>
|
||||
#else
|
||||
# define DIRECTORY_SEPARATOR '/'
|
||||
# include <unistd.h>
|
||||
# include <sys/stat.h>
|
||||
#endif
|
||||
#include <codecvt>
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
#include <filesystem>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
// NOTE: this is copied from common.cpp to avoid linking with libcommon
|
||||
// returns true if successful, false otherwise
|
||||
static bool fs_create_directory_with_parents(const std::string & path) {
|
||||
#ifdef _WIN32
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
std::wstring wpath = converter.from_bytes(path);
|
||||
|
||||
// if the path already exists, check whether it's a directory
|
||||
const DWORD attributes = GetFileAttributesW(wpath.c_str());
|
||||
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t pos_slash = 0;
|
||||
|
||||
// process path from front to back, procedurally creating directories
|
||||
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
|
||||
const std::wstring subpath = wpath.substr(0, pos_slash);
|
||||
const wchar_t * test = subpath.c_str();
|
||||
|
||||
const bool success = CreateDirectoryW(test, NULL);
|
||||
if (!success) {
|
||||
const DWORD error = GetLastError();
|
||||
|
||||
// if the path already exists, ensure that it's a directory
|
||||
if (error == ERROR_ALREADY_EXISTS) {
|
||||
const DWORD attributes = GetFileAttributesW(subpath.c_str());
|
||||
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
pos_slash += 1;
|
||||
}
|
||||
|
||||
return true;
|
||||
#else
|
||||
// if the path already exists, check whether it's a directory
|
||||
struct stat info;
|
||||
if (stat(path.c_str(), &info) == 0) {
|
||||
return S_ISDIR(info.st_mode);
|
||||
}
|
||||
|
||||
size_t pos_slash = 1; // skip leading slashes for directory creation
|
||||
|
||||
// process path from front to back, procedurally creating directories
|
||||
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
|
||||
const std::string subpath = path.substr(0, pos_slash);
|
||||
struct stat info;
|
||||
|
||||
// if the path already exists, ensure that it's a directory
|
||||
if (stat(subpath.c_str(), &info) == 0) {
|
||||
if (!S_ISDIR(info.st_mode)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// create parent directories
|
||||
const int ret = mkdir(subpath.c_str(), 0755);
|
||||
if (ret != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
pos_slash += 1;
|
||||
}
|
||||
|
||||
return true;
|
||||
#endif // _WIN32
|
||||
}
|
||||
|
||||
// NOTE: this is copied from common.cpp to avoid linking with libcommon
|
||||
static std::string fs_get_cache_directory() {
|
||||
std::string cache_directory = "";
|
||||
auto ensure_trailing_slash = [](std::string p) {
|
||||
// Make sure to add trailing slash
|
||||
if (p.back() != DIRECTORY_SEPARATOR) {
|
||||
p += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
return p;
|
||||
};
|
||||
if (getenv("LLAMA_CACHE")) {
|
||||
cache_directory = std::getenv("LLAMA_CACHE");
|
||||
} else {
|
||||
#ifdef __linux__
|
||||
if (std::getenv("XDG_CACHE_HOME")) {
|
||||
cache_directory = std::getenv("XDG_CACHE_HOME");
|
||||
} else {
|
||||
cache_directory = std::getenv("HOME") + std::string("/.cache/");
|
||||
}
|
||||
#elif defined(__APPLE__)
|
||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||
#elif defined(_WIN32)
|
||||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#endif // __linux__
|
||||
cache_directory = ensure_trailing_slash(cache_directory);
|
||||
cache_directory += "llama.cpp";
|
||||
}
|
||||
return ensure_trailing_slash(cache_directory);
|
||||
}
|
||||
|
||||
struct rpc_server_params {
|
||||
std::string host = "127.0.0.1";
|
||||
int port = 50052;
|
||||
size_t backend_mem = 0;
|
||||
bool use_cache = false;
|
||||
};
|
||||
|
||||
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
|
||||
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
|
||||
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
|
||||
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
|
||||
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
|
||||
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
|
||||
fprintf(stderr, " -c, --cache enable local file cache\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@@ -58,6 +178,8 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
|
||||
if (params.port <= 0 || params.port > 65535) {
|
||||
return false;
|
||||
}
|
||||
} else if (arg == "-c" || arg == "--cache") {
|
||||
params.use_cache = true;
|
||||
} else if (arg == "-m" || arg == "--mem") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
@@ -164,8 +286,20 @@ int main(int argc, char * argv[]) {
|
||||
} else {
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
}
|
||||
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
|
||||
ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
|
||||
const char * cache_dir = nullptr;
|
||||
std::string cache_dir_str = fs_get_cache_directory() + "rpc/";
|
||||
if (params.use_cache) {
|
||||
if (!fs_create_directory_with_parents(cache_dir_str)) {
|
||||
fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
|
||||
return 1;
|
||||
}
|
||||
cache_dir = cache_dir_str.c_str();
|
||||
}
|
||||
printf("Starting RPC server\n");
|
||||
printf(" endpoint : %s\n", endpoint.c_str());
|
||||
printf(" local cache : %s\n", cache_dir ? cache_dir : "n/a");
|
||||
printf(" backend memory : %zu MB\n", free_mem / (1024 * 1024));
|
||||
ggml_backend_rpc_start_server(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
|
||||
ggml_backend_free(backend);
|
||||
return 0;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -133,7 +133,8 @@ struct slot_params {
|
||||
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : sampling.grammar_triggers) {
|
||||
grammar_triggers.push_back(trigger.to_json<json>());
|
||||
server_grammar_trigger ct(std::move(trigger));
|
||||
grammar_triggers.push_back(ct.to_json());
|
||||
}
|
||||
|
||||
return json {
|
||||
@@ -372,9 +373,9 @@ struct server_task {
|
||||
const auto grammar_triggers = data.find("grammar_triggers");
|
||||
if (grammar_triggers != data.end()) {
|
||||
for (const auto & t : *grammar_triggers) {
|
||||
auto ct = common_grammar_trigger::from_json(t);
|
||||
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
||||
const auto & word = ct.value;
|
||||
server_grammar_trigger ct(t);
|
||||
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
||||
const auto & word = ct.value.value;
|
||||
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
|
||||
if (ids.size() == 1) {
|
||||
auto token = ids[0];
|
||||
@@ -392,7 +393,7 @@ struct server_task {
|
||||
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
|
||||
}
|
||||
} else {
|
||||
params.sampling.grammar_triggers.push_back(ct);
|
||||
params.sampling.grammar_triggers.push_back(std::move(ct.value));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -489,8 +490,12 @@ struct result_timings {
|
||||
double predicted_per_token_ms;
|
||||
double predicted_per_second;
|
||||
|
||||
// Optional speculative metrics - only included when > 0
|
||||
int32_t draft_n = 0;
|
||||
int32_t draft_n_accepted = 0;
|
||||
|
||||
json to_json() const {
|
||||
return {
|
||||
json base = {
|
||||
{"prompt_n", prompt_n},
|
||||
{"prompt_ms", prompt_ms},
|
||||
{"prompt_per_token_ms", prompt_per_token_ms},
|
||||
@@ -501,6 +506,13 @@ struct result_timings {
|
||||
{"predicted_per_token_ms", predicted_per_token_ms},
|
||||
{"predicted_per_second", predicted_per_second},
|
||||
};
|
||||
|
||||
if (draft_n > 0) {
|
||||
base["draft_n"] = draft_n;
|
||||
base["draft_n_accepted"] = draft_n_accepted;
|
||||
}
|
||||
|
||||
return base;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1299,6 +1311,10 @@ struct server_slot {
|
||||
|
||||
std::function<void(int)> callback_on_release;
|
||||
|
||||
// Speculative decoding stats
|
||||
int32_t n_draft_total = 0; // Total draft tokens generated
|
||||
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
|
||||
|
||||
void reset() {
|
||||
SLT_DBG(*this, "%s", "\n");
|
||||
|
||||
@@ -1315,6 +1331,10 @@ struct server_slot {
|
||||
|
||||
generated_tokens.clear();
|
||||
generated_token_probs.clear();
|
||||
|
||||
// clear speculative decoding stats
|
||||
n_draft_total = 0;
|
||||
n_draft_accepted = 0;
|
||||
}
|
||||
|
||||
bool is_non_causal() const {
|
||||
@@ -1381,6 +1401,12 @@ struct server_slot {
|
||||
timings.predicted_per_token_ms = t_token_generation / n_decoded;
|
||||
timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
|
||||
|
||||
// Add speculative metrics
|
||||
if (n_draft_total > 0) {
|
||||
timings.draft_n = n_draft_total;
|
||||
timings.draft_n_accepted = n_draft_accepted;
|
||||
}
|
||||
|
||||
return timings;
|
||||
}
|
||||
|
||||
@@ -1428,6 +1454,15 @@ struct server_slot {
|
||||
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
|
||||
t_token_generation, n_decoded, t_gen, n_gen_second,
|
||||
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
|
||||
|
||||
if (n_draft_total > 0) {
|
||||
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
|
||||
SLT_INF(*this,
|
||||
"\n"
|
||||
"draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
|
||||
draft_ratio, n_draft_accepted, n_draft_total
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
json to_json() const {
|
||||
@@ -1842,7 +1877,7 @@ struct server_context {
|
||||
}
|
||||
|
||||
bool load_model(const common_params & params) {
|
||||
SRV_INF("loading model '%s'\n", params.model.c_str());
|
||||
SRV_INF("loading model '%s'\n", params.model.path.c_str());
|
||||
|
||||
params_base = params;
|
||||
|
||||
@@ -1852,7 +1887,7 @@ struct server_context {
|
||||
ctx = llama_init.context.get();
|
||||
|
||||
if (model == nullptr) {
|
||||
SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
|
||||
SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1863,16 +1898,13 @@ struct server_context {
|
||||
add_bos_token = llama_vocab_get_add_bos(vocab);
|
||||
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
|
||||
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
|
||||
if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
|
||||
|
||||
auto params_dft = params_base;
|
||||
|
||||
params_dft.devices = params_base.speculative.devices;
|
||||
params_dft.hf_file = params_base.speculative.hf_file;
|
||||
params_dft.hf_repo = params_base.speculative.hf_repo;
|
||||
params_dft.model = params_base.speculative.model;
|
||||
params_dft.model_url = params_base.speculative.model_url;
|
||||
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
|
||||
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
|
||||
params_dft.n_parallel = 1;
|
||||
@@ -1886,12 +1918,12 @@ struct server_context {
|
||||
model_dft = llama_init_dft.model.get();
|
||||
|
||||
if (model_dft == nullptr) {
|
||||
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
|
||||
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
|
||||
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
|
||||
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -3290,6 +3322,9 @@ struct server_context {
|
||||
|
||||
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
|
||||
|
||||
// keep track of total number of tokens generated in the draft
|
||||
slot.n_draft_total += draft.size();
|
||||
|
||||
// ignore small drafts
|
||||
if (slot.params.speculative.n_min > (int) draft.size()) {
|
||||
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
|
||||
@@ -3315,6 +3350,9 @@ struct server_context {
|
||||
slot.n_past += ids.size();
|
||||
slot.n_decoded += ids.size();
|
||||
|
||||
// update how many tokens out of draft was accepted
|
||||
slot.n_draft_accepted += ids.size() - 1;
|
||||
|
||||
slot.cache_tokens.push_back(id);
|
||||
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
|
||||
|
||||
@@ -3825,7 +3863,7 @@ int main(int argc, char ** argv) {
|
||||
json data = {
|
||||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||
{ "model_path", ctx_server.params_base.model },
|
||||
{ "model_path", ctx_server.params_base.model.path },
|
||||
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
|
||||
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
|
||||
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
|
||||
@@ -4091,7 +4129,7 @@ int main(int argc, char ** argv) {
|
||||
{"object", "list"},
|
||||
{"data", {
|
||||
{
|
||||
{"id", params.model_alias.empty() ? params.model : params.model_alias},
|
||||
{"id", params.model_alias.empty() ? params.model.path : params.model_alias},
|
||||
{"object", "model"},
|
||||
{"created", std::time(0)},
|
||||
{"owned_by", "llamacpp"},
|
||||
@@ -4459,15 +4497,24 @@ int main(int argc, char ** argv) {
|
||||
llama_backend_free();
|
||||
};
|
||||
|
||||
// bind HTTP listen port
|
||||
bool was_bound = false;
|
||||
if (params.port == 0) {
|
||||
int bound_port = svr->bind_to_any_port(params.hostname);
|
||||
if ((was_bound = (bound_port >= 0))) {
|
||||
params.port = bound_port;
|
||||
}
|
||||
if (string_ends_with(std::string(params.hostname), ".sock")) {
|
||||
LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
|
||||
svr->set_address_family(AF_UNIX);
|
||||
// bind_to_port requires a second arg, any value other than 0 should
|
||||
// simply get ignored
|
||||
was_bound = svr->bind_to_port(params.hostname, 8080);
|
||||
} else {
|
||||
was_bound = svr->bind_to_port(params.hostname, params.port);
|
||||
LOG_INF("%s: binding port with default address family\n", __func__);
|
||||
// bind HTTP listen port
|
||||
if (params.port == 0) {
|
||||
int bound_port = svr->bind_to_any_port(params.hostname);
|
||||
if ((was_bound = (bound_port >= 0))) {
|
||||
params.port = bound_port;
|
||||
}
|
||||
} else {
|
||||
was_bound = svr->bind_to_port(params.hostname, params.port);
|
||||
}
|
||||
}
|
||||
|
||||
if (!was_bound) {
|
||||
|
||||
@@ -58,6 +58,32 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
||||
|
||||
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||
|
||||
// thin wrapper around common_grammar_trigger with (de)serialization functions
|
||||
struct server_grammar_trigger {
|
||||
common_grammar_trigger value;
|
||||
|
||||
server_grammar_trigger() = default;
|
||||
server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
|
||||
server_grammar_trigger(const json & in) {
|
||||
value.type = (common_grammar_trigger_type) in.at("type").get<int>();
|
||||
value.value = in.at("value").get<std::string>();
|
||||
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
value.token = (llama_token) in.at("token").get<int>();
|
||||
}
|
||||
}
|
||||
|
||||
json to_json() const {
|
||||
json out {
|
||||
{"type", (int) value.type},
|
||||
{"value", value.value},
|
||||
};
|
||||
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
out["token"] = (int) value.token;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// tokenizer and input processing utils
|
||||
//
|
||||
@@ -627,7 +653,8 @@ static json oaicompat_completion_params_parse(
|
||||
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : chat_params.grammar_triggers) {
|
||||
grammar_triggers.push_back(trigger.to_json<json>());
|
||||
server_grammar_trigger ct(trigger);
|
||||
grammar_triggers.push_back(ct.to_json());
|
||||
}
|
||||
llama_params["grammar_triggers"] = grammar_triggers;
|
||||
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
|
||||
|
||||
@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.speculative.model.empty()) {
|
||||
if (params.speculative.model.path.empty()) {
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.speculative.model.empty()) {
|
||||
if (params.speculative.model.path.empty()) {
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -577,12 +577,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
|
||||
|
||||
// TODO: refactor in a common struct
|
||||
params.model = params.vocoder.model;
|
||||
params.model_url = params.vocoder.model_url;
|
||||
params.hf_repo = params.vocoder.hf_repo;
|
||||
params.hf_file = params.vocoder.hf_file;
|
||||
|
||||
params.model = params.vocoder.model;
|
||||
params.embedding = true;
|
||||
|
||||
common_init_result llama_init_cts = common_init_from_params(params);
|
||||
@@ -699,11 +694,13 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
const std::string voice_data = audio_data;
|
||||
|
||||
auto tmp = common_tokenize(vocab, voice_data, false, true);
|
||||
printf("\n\n");
|
||||
|
||||
std::ostringstream tokens_oss;
|
||||
for (size_t i = 0; i < tmp.size(); ++i) {
|
||||
printf("%d, ", tmp[i]);
|
||||
tokens_oss << tmp[i] << ", ";
|
||||
}
|
||||
printf("\n\n");
|
||||
LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
|
||||
|
||||
prompt_add(prompt_inp, tmp);
|
||||
#else
|
||||
prompt_add(prompt_inp, llama_tokens {
|
||||
|
||||
@@ -100,6 +100,10 @@ else()
|
||||
set(INS_ENB ON)
|
||||
endif()
|
||||
|
||||
message(DEBUG "GGML_NATIVE : ${GGML_NATIVE}")
|
||||
message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
|
||||
message(DEBUG "INS_ENB : ${INS_ENB}")
|
||||
|
||||
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
||||
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
||||
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
|
||||
@@ -127,7 +131,8 @@ option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
||||
option(GGML_VXE "ggml: enable vxe" ON)
|
||||
|
||||
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
||||
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
||||
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
||||
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
|
||||
|
||||
|
||||
if (WIN32)
|
||||
|
||||
22
ggml/cmake/GitVars.cmake
Normal file
22
ggml/cmake/GitVars.cmake
Normal file
@@ -0,0 +1,22 @@
|
||||
find_package(Git)
|
||||
|
||||
# the commit's SHA1
|
||||
execute_process(COMMAND
|
||||
"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
|
||||
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||
OUTPUT_VARIABLE GIT_SHA1
|
||||
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
# the date of the commit
|
||||
execute_process(COMMAND
|
||||
"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
|
||||
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||
OUTPUT_VARIABLE GIT_DATE
|
||||
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
# the subject of the commit
|
||||
execute_process(COMMAND
|
||||
"${GIT_EXECUTABLE}" log -1 --format=%s
|
||||
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||
OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
|
||||
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
||||
set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
|
||||
@@ -17,7 +17,9 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
|
||||
|
||||
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||
|
||||
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
|
||||
const char * cache_dir,
|
||||
size_t free_mem, size_t total_mem);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
|
||||
|
||||
|
||||
@@ -1791,11 +1791,11 @@ extern "C" {
|
||||
|
||||
#define GGML_KQ_MASK_PAD 64
|
||||
|
||||
// q: [n_embd, n_batch, n_head, 1]
|
||||
// k: [n_embd, n_kv, n_head_kv, 1]
|
||||
// v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd, n_head, n_batch, 1] !! permuted !!
|
||||
// q: [n_embd_k, n_batch, n_head, 1]
|
||||
// k: [n_embd_k, n_kv, n_head_kv, 1]
|
||||
// v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
|
||||
@@ -65,7 +65,7 @@ if (GGML_LTO)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_CCACHE)
|
||||
if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
|
||||
find_program(GGML_CCACHE_FOUND ccache)
|
||||
find_program(GGML_SCCACHE_FOUND sccache)
|
||||
|
||||
|
||||
@@ -1,168 +0,0 @@
|
||||
---
|
||||
Language: Cpp
|
||||
# BasedOnStyle: Google
|
||||
AccessModifierOffset: -1
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveMacros: false
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlines: Left
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: true
|
||||
AllowAllArgumentsOnNextLine: true
|
||||
AllowAllConstructorInitializersOnNextLine: true
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: Never
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: All
|
||||
AllowShortLambdasOnASingleLine: All
|
||||
AllowShortIfStatementsOnASingleLine: WithoutElse
|
||||
AllowShortLoopsOnASingleLine: true
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
AlwaysBreakTemplateDeclarations: Yes
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BraceWrapping:
|
||||
AfterCaseLabel: false
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: false
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Attach
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakInheritanceList: BeforeColon
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: BeforeColon
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: true
|
||||
ColumnLimit: 80
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: true
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DeriveLineEnding: true
|
||||
DerivePointerAlignment: true
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: true
|
||||
ForEachMacros:
|
||||
- foreach
|
||||
- Q_FOREACH
|
||||
- BOOST_FOREACH
|
||||
IncludeBlocks: Regroup
|
||||
IncludeCategories:
|
||||
- Regex: '^<ext/.*\.h>'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*\.h>'
|
||||
Priority: 1
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
- Regex: '.*'
|
||||
Priority: 3
|
||||
SortPriority: 0
|
||||
IncludeIsMainRegex: '([-_](test|unittest))?$'
|
||||
IncludeIsMainSourceRegex: ''
|
||||
IndentCaseLabels: true
|
||||
IndentGotoLabels: true
|
||||
IndentPPDirectives: None
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBinPackProtocolList: Never
|
||||
ObjCBlockIndentWidth: 2
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PenaltyBreakAssignment: 2
|
||||
PenaltyBreakBeforeFirstCallParameter: 1
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyBreakTemplateDeclaration: 10
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Left
|
||||
RawStringFormats:
|
||||
- Language: Cpp
|
||||
Delimiters:
|
||||
- cc
|
||||
- CC
|
||||
- cpp
|
||||
- Cpp
|
||||
- CPP
|
||||
- 'c++'
|
||||
- 'C++'
|
||||
CanonicalDelimiter: ''
|
||||
BasedOnStyle: google
|
||||
- Language: TextProto
|
||||
Delimiters:
|
||||
- pb
|
||||
- PB
|
||||
- proto
|
||||
- PROTO
|
||||
EnclosingFunctions:
|
||||
- EqualsProto
|
||||
- EquivToProto
|
||||
- PARSE_PARTIAL_TEXT_PROTO
|
||||
- PARSE_TEST_PROTO
|
||||
- PARSE_TEXT_PROTO
|
||||
- ParseTextOrDie
|
||||
- ParseTextProtoOrDie
|
||||
CanonicalDelimiter: ''
|
||||
BasedOnStyle: google
|
||||
ReflowComments: true
|
||||
SortIncludes: true
|
||||
SortUsingDeclarations: true
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterLogicalNot: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCpp11BracedList: false
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceInEmptyBlock: false
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 2
|
||||
SpacesInAngles: false
|
||||
SpacesInConditionalStatement: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
SpaceBeforeSquareBrackets: false
|
||||
Standard: Auto
|
||||
StatementMacros:
|
||||
- Q_UNUSED
|
||||
- QT_REQUIRE_VERSION
|
||||
TabWidth: 8
|
||||
UseCRLF: false
|
||||
UseTab: Never
|
||||
...
|
||||
|
||||
@@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR)
|
||||
${CANN_INSTALL_DIR}/acllib/include
|
||||
)
|
||||
|
||||
add_subdirectory(kernels)
|
||||
list(APPEND CANN_LIBRARIES
|
||||
ascendcl
|
||||
nnopbase
|
||||
opapi
|
||||
acl_op_compiler
|
||||
ascendc_kernels
|
||||
)
|
||||
|
||||
file(GLOB GGML_SOURCES_CANN "*.cpp")
|
||||
|
||||
@@ -54,9 +54,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
// added.
|
||||
int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
|
||||
|
||||
int64_t acl_storage_len = 0;
|
||||
if (ne == nullptr) {
|
||||
acl_storage_len = ggml_nbytes(tensor);
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
acl_ne[i] = tensor->ne[i];
|
||||
// The step size of acl is in elements.
|
||||
@@ -65,14 +63,18 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
} else {
|
||||
// With bcast
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (ne[i] - 1) * nb[i];
|
||||
acl_ne[i] = ne[i];
|
||||
acl_stride[i] = nb[i] / ggml_element_size(tensor);
|
||||
}
|
||||
}
|
||||
|
||||
// Reverse ne and stride.
|
||||
int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
|
||||
int64_t acl_storage_len = 1;
|
||||
for (int i = 0; i < final_dims; i++) {
|
||||
acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
|
||||
}
|
||||
|
||||
// Reverse ne and stride.
|
||||
std::reverse(acl_ne, acl_ne + final_dims);
|
||||
std::reverse(acl_stride, acl_stride + final_dims);
|
||||
|
||||
|
||||
@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
|
||||
tmp_stride[i] = nb[i] / type_size;
|
||||
}
|
||||
|
||||
int64_t acl_storage_len = 1;
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
|
||||
}
|
||||
|
||||
std::reverse(tmp_ne, tmp_ne + dims);
|
||||
std::reverse(tmp_stride, tmp_stride + dims);
|
||||
|
||||
int64_t acl_storage_len = 0;
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (ne[i] - 1) * nb[i];
|
||||
}
|
||||
|
||||
aclTensor* acl_tensor =
|
||||
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
|
||||
format, &acl_storage_len, 1, data_ptr);
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include <aclnnop/aclnn_copy.h>
|
||||
#include <aclnnop/aclnn_cos.h>
|
||||
#include <aclnnop/aclnn_div.h>
|
||||
#include <aclnnop/aclnn_embedding.h>
|
||||
#include <aclnnop/aclnn_exp.h>
|
||||
#include <aclnnop/aclnn_fill_scalar.h>
|
||||
#include <aclnnop/aclnn_group_norm.h>
|
||||
@@ -58,7 +59,6 @@
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "kernels/ascendc_kernels.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
|
||||
@@ -99,6 +99,35 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
ACL_CHECK(aclDestroyIntArray(repeats));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Casts the elements of a tensor to a specified data type using the CANN backend.
|
||||
*
|
||||
* @details This function performs a type conversion on the elements of the input tensor `acl_src`
|
||||
* and stores the results in the destination tensor `acl_dst`. The conversion type is
|
||||
* determined based on the `dst` tensor's data type.
|
||||
*
|
||||
* @param ctx The context for the CANN backend operations.
|
||||
* @param acl_src The source tensor whose elements will be cast.
|
||||
* @param acl_dst The destination tensor that will store the casted elements.
|
||||
* @param dst The ggml tensor specifying the target data type.
|
||||
*/
|
||||
static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
aclTensor* acl_dst, ggml_tensor* dst) {
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src,
|
||||
ggml_cann_type_mapping(dst->type),
|
||||
acl_dst, &workspaceSize, &executor));
|
||||
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
GGML_ASSERT(ggml_can_repeat(src, dst));
|
||||
@@ -329,8 +358,6 @@ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
float min;
|
||||
float max;
|
||||
@@ -889,173 +916,76 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
}
|
||||
|
||||
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
src->extra = src_extra_allocator.get();
|
||||
dst->extra = dst_extra_allocator.get();
|
||||
ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
|
||||
if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
|
||||
ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
// TODO: simplify
|
||||
if (src->type == GGML_TYPE_F16) {
|
||||
if (dst->type == GGML_TYPE_Q8_0) {
|
||||
aclrtlaunch_ascendc_quantize_f16_q8_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_Q4_0) {
|
||||
aclrtlaunch_ascendc_quantize_f16_to_q4_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src0 is contigous on first dimension, copy by rows
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp16(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src0 is contigous on first dimension, copy by rows
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
// TODO
|
||||
GGML_ABORT("fatal error");
|
||||
} else if (src->type == GGML_TYPE_F32) {
|
||||
// TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
|
||||
// && nb0 == type_size)
|
||||
if (dst->type == GGML_TYPE_Q8_0) {
|
||||
aclrtlaunch_ascendc_quantize_f32_q8_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_Q4_0) {
|
||||
aclrtlaunch_ascendc_quantize_f32_to_q4_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src0 is contigous on first dimension, copy by rows
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp32(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
} else {
|
||||
// TODO: dst not contiguous
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src0 is contigous on first dimension, copy by rows
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
// TODO
|
||||
GGML_ABORT("fatal error");
|
||||
} else {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
if (ggml_are_same_shape(src0, dst)) {
|
||||
if (dst->type == src0->type) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
} else {
|
||||
aclnn_cast(ctx, acl_src, acl_dst, dst);
|
||||
}
|
||||
} else {
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
|
||||
if (dst->type == src0->type) {
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
dst->data, cpy_size, src0->data, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
||||
return;
|
||||
} else {
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(),
|
||||
ggml_nelements(dst) * ggml_type_size(dst->type));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb,
|
||||
GGML_MAX_DIMS);
|
||||
|
||||
aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
dst->data, cpy_size, src_trans_buffer, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
||||
ACL_CHECK(aclDestroyTensor(src_trans_tensor));
|
||||
return;
|
||||
}
|
||||
} else if (ggml_is_contiguous(dst)) {
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb,
|
||||
GGML_MAX_DIMS);
|
||||
|
||||
aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
|
||||
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer,
|
||||
cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclDestroyTensor(src_trans_tensor));
|
||||
return;
|
||||
} else {
|
||||
GGML_ABORT("Unsupport dst is not tontiguous.");
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -1158,8 +1088,6 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
GGML_ASSERT(eps > 0.0f);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
@@ -2378,85 +2306,168 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
|
||||
}
|
||||
|
||||
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
ggml_tensor* src1 = dst->src[1];
|
||||
/**
|
||||
* @brief Performs embedding operation on a 4D tensor using the CANN backend.
|
||||
*
|
||||
* This function extracts slices from the source tensor (`src_buffer`),
|
||||
* index tensor (`index`), and destination tensor (`dst`), and performs an
|
||||
* embedding operation on them. The embedding operation is applied by iterating
|
||||
* over the last two dimensions of the source tensor, creating the necessary
|
||||
* tensors for the source, index, and output, and executing the embedding operation.
|
||||
*
|
||||
* @param ctx The context for CANN backend operations.
|
||||
* @param src_buffer The source buffer holding the data for the source tensor.
|
||||
* @param src_ne The dimensions of the source tensor.
|
||||
* @param src_nb The strides (byte offsets) of the source tensor.
|
||||
* @param index The index tensor used in the embedding operation.
|
||||
* @param dst The destination tensor where the result will be stored.
|
||||
*/
|
||||
static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
|
||||
int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
|
||||
ggml_tensor* dst) {
|
||||
for (int64_t i = 0; i < src_ne[3]; i++) {
|
||||
for (int64_t j = 0; j < src_ne[2]; j++) {
|
||||
// src
|
||||
int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
|
||||
size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
|
||||
aclTensor* acl_src_tensor = ggml_cann_create_tensor(
|
||||
(char*)src_buffer + i * src_nb[3] + j * src_nb[2],
|
||||
ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
|
||||
acl_src_ne, acl_src_nb, 2);
|
||||
|
||||
ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
src0->extra = src0_extra_allocator.get();
|
||||
src1->extra = src1_extra_allocator.get();
|
||||
dst->extra = dst_extra_allocator.get();
|
||||
ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
// index
|
||||
int64_t acl_index_ne[1] = {index->ne[0]};
|
||||
size_t acl_index_nb[1] = {index->nb[0]};
|
||||
aclTensor* acl_index = ggml_cann_create_tensor(
|
||||
(char*)index->data + i * index->nb[2] + j * index->nb[1],
|
||||
ggml_cann_type_mapping(index->type), ggml_element_size(index),
|
||||
acl_index_ne, acl_index_nb, 1);
|
||||
|
||||
// out
|
||||
int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
|
||||
size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
|
||||
aclTensor* acl_out = ggml_cann_create_tensor(
|
||||
(char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
|
||||
ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
|
||||
acl_out_ne, acl_out_nb, 2);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(aclnnEmbeddingGetWorkspaceSize(
|
||||
acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor));
|
||||
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
|
||||
workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor,
|
||||
ctx.stream()));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_index));
|
||||
ACL_CHECK(aclDestroyTensor(acl_out));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0]; // src
|
||||
ggml_tensor* src1 = dst->src[1]; // index
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
#ifdef ASCEND_310P
|
||||
// Special operation for get_row_f32 kernel of 310P: clear the
|
||||
// content of dest data buffer when row is not aligned to 32 bytes
|
||||
if ((src0->ne[0] % 8) != 0) {
|
||||
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
|
||||
src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
|
||||
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
|
||||
}
|
||||
#endif
|
||||
aclrtlaunch_ascendc_get_row_f32(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src0->extra)->nb,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
|
||||
dst);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
#ifdef ASCEND_310P
|
||||
// Special operation for get_row_f16 kernel of 310P: clear the
|
||||
// content of dest data buffer when row is not aligned to 32 bytes
|
||||
if ((src0->ne[0] % 16) != 0) {
|
||||
size_t dst_len =
|
||||
src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
|
||||
ggml_type_size(
|
||||
GGML_TYPE_F32); // out is also f32, even input is f16
|
||||
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
#endif
|
||||
aclrtlaunch_ascendc_get_row_f16(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src0->extra)->nb,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
|
||||
src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0, src_trans_tensor, dst);
|
||||
aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
|
||||
src_trans_nb, src1, dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src0));
|
||||
ACL_CHECK(aclDestroyTensor(src_trans_tensor));
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_Q4_0:
|
||||
aclrtlaunch_ascendc_get_row_q4_0(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
aclrtlaunch_ascendc_get_row_q8_0(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
case GGML_TYPE_Q8_0: {
|
||||
// add 1 dim for bcast mul.
|
||||
size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
|
||||
dequant_nb[GGML_MAX_DIMS + 1];
|
||||
int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
|
||||
*dequant_ne;
|
||||
int64_t scale_offset = 0;
|
||||
|
||||
// [3,4,5,64] -> [3,4,5,2,32]
|
||||
weight_ne[0] = QK8_0;
|
||||
weight_ne[1] = src0->ne[0] / QK8_0;
|
||||
weight_nb[0] = sizeof(int8_t);
|
||||
weight_nb[1] = weight_nb[0] * weight_ne[0];
|
||||
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
|
||||
weight_ne[i] = src0->ne[i - 1];
|
||||
weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
|
||||
}
|
||||
|
||||
// [3,4,5,64] -> [3,4,5,2,1]
|
||||
scale_ne[0] = 1;
|
||||
scale_ne[1] = src0->ne[0] / QK8_0;
|
||||
scale_nb[0] = sizeof(uint16_t);
|
||||
scale_nb[1] = scale_nb[0] * scale_ne[0];
|
||||
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
|
||||
scale_ne[i] = src0->ne[i - 1];
|
||||
scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// [3,4,5,64] -> [3,4,5,2,32]
|
||||
dequant_ne = weight_ne;
|
||||
dequant_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
|
||||
}
|
||||
|
||||
scale_offset = ggml_nelements(src0) * sizeof(int8_t);
|
||||
ggml_cann_pool_alloc dequant_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
|
||||
|
||||
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
|
||||
src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
|
||||
GGML_MAX_DIMS + 1);
|
||||
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
|
||||
src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb,
|
||||
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
|
||||
aclTensor* dequant_tensor = ggml_cann_create_tensor(
|
||||
dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
|
||||
dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
|
||||
|
||||
aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
|
||||
dequant_nb[0] = sizeof(float_t);
|
||||
dequant_ne = src0->ne;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
|
||||
aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
|
||||
dequant_ne, dequant_nb, src1, dst);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(dequant_tensor));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -2797,8 +2808,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
|
||||
nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
|
||||
&workspaceSize, &executor));
|
||||
nullptr, nullptr, nullptr, antiquantGroupSize,
|
||||
acl_output_tensor, &workspaceSize, &executor));
|
||||
if (workspaceAddr == nullptr) {
|
||||
workspaceAddr = workspace_allocator.alloc(workspaceSize);
|
||||
}
|
||||
@@ -3137,7 +3148,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
// TODO: use ascendc
|
||||
// Only test with LLAMA model.
|
||||
ggml_tensor* src0 = dst->src[0]; // input
|
||||
ggml_tensor* src2 = dst->src[2]; // freq_factors
|
||||
// ggml_tensor* src2 = dst->src[2]; // freq_factors, not used now.
|
||||
|
||||
// param
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
|
||||
@@ -535,9 +535,6 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
|
||||
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
@@ -566,9 +563,6 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
|
||||
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
|
||||
@@ -1458,11 +1458,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
||||
ACL_CHECK(aclrtSynchronizeDevice());
|
||||
ACL_CHECK(aclrtResetDevice(cann_ctx->device));
|
||||
|
||||
// finalize when last backend freed.
|
||||
if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
|
||||
ACL_CHECK(aclFinalize());
|
||||
}
|
||||
|
||||
delete cann_ctx;
|
||||
delete backend;
|
||||
}
|
||||
@@ -1688,11 +1683,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
case GGML_OP_MUL_MAT: {
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q4_0:
|
||||
return true;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
// only support contiguous for quantized types.
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
ggml_is_contiguous(op->src[1]);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -1704,7 +1702,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
default:
|
||||
@@ -1712,16 +1709,21 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_CPY: {
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
ggml_tensor *src = op->src[0];
|
||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
||||
(src->type != GGML_TYPE_F32 &&
|
||||
src->type != GGML_TYPE_F16)) {
|
||||
// only support F32 and F16.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
|
||||
// unsupport dst is not contiguous.
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
} break;
|
||||
case GGML_OP_CONT: {
|
||||
// TODO: support GGML_TYPE_BF16
|
||||
switch (op->src[0]->type) {
|
||||
@@ -1734,13 +1736,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
// TODO: with ops-test v == 1
|
||||
float * ext_factor = (float*)((int32_t*)op->op_params + 7);
|
||||
float ext_factor = 0.0f;
|
||||
memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
|
||||
// TODO: n_dims <= ne0
|
||||
if (op->src[0]->ne[0] != op->op_params[1]) {
|
||||
return false;
|
||||
}
|
||||
// TODO: ext_factor != 0
|
||||
if (*ext_factor != 0) {
|
||||
if (ext_factor != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1762,9 +1765,19 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_POOL_2D: {
|
||||
const int32_t * opts = (const int32_t *) op->op_params;
|
||||
const int k0 = opts[1];
|
||||
const int k1 = opts[2];
|
||||
const int p0 = opts[5];
|
||||
const int p1 = opts[6];
|
||||
// value of paddingH should be at most half of kernelH
|
||||
// value of paddingW should be at most half of kernelW
|
||||
return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
|
||||
}
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
@@ -1781,7 +1794,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
file(GLOB SRC_FILES
|
||||
get_row_f32.cpp
|
||||
get_row_f16.cpp
|
||||
get_row_q4_0.cpp
|
||||
get_row_q8_0.cpp
|
||||
quantize_f32_q8_0.cpp
|
||||
quantize_f16_q8_0.cpp
|
||||
quantize_float_to_q4_0.cpp
|
||||
dup.cpp
|
||||
)
|
||||
|
||||
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
|
||||
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
|
||||
|
||||
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
|
||||
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
|
||||
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
|
||||
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
|
||||
else()
|
||||
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
|
||||
endif()
|
||||
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
|
||||
|
||||
ascendc_library(ascendc_kernels STATIC
|
||||
${SRC_FILES}
|
||||
)
|
||||
|
||||
message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
|
||||
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
||||
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
|
||||
@@ -1,19 +0,0 @@
|
||||
#ifndef ASCENDC_KERNELS_H
|
||||
#define ASCENDC_KERNELS_H
|
||||
|
||||
#include "aclrtlaunch_ascendc_get_row_f32.h"
|
||||
#include "aclrtlaunch_ascendc_get_row_f16.h"
|
||||
#include "aclrtlaunch_ascendc_get_row_q8_0.h"
|
||||
#include "aclrtlaunch_ascendc_get_row_q4_0.h"
|
||||
|
||||
#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
|
||||
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
|
||||
#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
|
||||
#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
|
||||
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
|
||||
|
||||
#endif // ASCENDC_KERNELS_H
|
||||
@@ -1,234 +0,0 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template <typename SRC_T, typename DST_T>
|
||||
|
||||
template <typename SRC_T, typename DST_T>
|
||||
class DupByRows {
|
||||
public:
|
||||
__aicore__ inline DupByRows() {}
|
||||
__aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
|
||||
size_t *input_nb_ub) {
|
||||
/* Dup by rows when src is contigous on first dimension and dst is
|
||||
contiguous, each kernel process one row.
|
||||
*/
|
||||
|
||||
// Input has four dims.
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
// param
|
||||
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
|
||||
num_elem = input_ne_ub[0];
|
||||
|
||||
// index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
|
||||
idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
|
||||
idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
|
||||
/ (input_ne_ub[1]);
|
||||
idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
|
||||
- idx_ne2 * input_ne_ub[1];
|
||||
|
||||
// src may not contiguous in dim [1,2,3], so stride decited by ne&nb
|
||||
src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
|
||||
+ input_nb_ub[1] * idx_ne1;
|
||||
|
||||
// dst is contiguous
|
||||
dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
|
||||
|
||||
src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
|
||||
src_stride));
|
||||
dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
|
||||
dst_stride));
|
||||
|
||||
pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
|
||||
32 - 1) / 32 * 32);
|
||||
pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
|
||||
32 - 1) / 32 * 32);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in() {
|
||||
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
|
||||
const size_t elem_per_block = 32 / sizeof(SRC_T);
|
||||
size_t tail = num_elem % elem_per_block;
|
||||
size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
|
||||
DataCopy(src_local, src_gm, cpy_elements_len);
|
||||
src_queue.EnQue(src_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out() {
|
||||
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
|
||||
#ifdef ASCEND_310P
|
||||
const size_t elem_per_block = 32 / sizeof(DST_T);
|
||||
size_t tail = num_elem % elem_per_block;
|
||||
size_t len = num_elem & ~(elem_per_block - 1);
|
||||
if (len > 0) {
|
||||
DataCopy(dst_gm, dst_local, len);
|
||||
}
|
||||
if(tail != 0) {
|
||||
for (size_t i = tail; i < elem_per_block; i++) {
|
||||
dst_local[len + i].SetValue(0, 0);
|
||||
}
|
||||
SetAtomicAdd<float>();
|
||||
DataCopy(dst_gm[len], dst_local[len], elem_per_block);
|
||||
SetAtomicNone();
|
||||
}
|
||||
#else
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = num_elem * sizeof(DST_T);
|
||||
DataCopyPad(dst_gm, dst_local, dataCopyParams);
|
||||
#endif
|
||||
dst_queue.FreeTensor(dst_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void dup() {
|
||||
// main process, copy one row data from src to dst.
|
||||
copy_in();
|
||||
|
||||
LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
|
||||
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
|
||||
|
||||
int32_t BLOCK_NUM = 32 / sizeof(DST_T);
|
||||
DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
|
||||
/ BLOCK_NUM * BLOCK_NUM);
|
||||
dst_queue.EnQue<DST_T>(dst_local);
|
||||
|
||||
src_queue.FreeTensor(src_local);
|
||||
copy_out();
|
||||
}
|
||||
|
||||
__aicore__ inline void dup_with_cast() {
|
||||
// main process, copy one row data from src to dst.
|
||||
// cast dtype from src to dst.
|
||||
copy_in();
|
||||
|
||||
LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
|
||||
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
|
||||
|
||||
Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
|
||||
dst_queue.EnQue<DST_T>(dst_local);
|
||||
|
||||
src_queue.FreeTensor(src_local);
|
||||
copy_out();
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<SRC_T> src_gm;
|
||||
GlobalTensor<DST_T> dst_gm;
|
||||
|
||||
int64_t num_rows;
|
||||
int64_t num_elem;
|
||||
int64_t idx_ne3;
|
||||
int64_t idx_ne2;
|
||||
int64_t idx_ne1;
|
||||
int64_t src_stride;
|
||||
int64_t dst_stride;
|
||||
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<half, half> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<float, float> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<float, half> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup_with_cast();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
|
||||
// copy params from gm to ub.
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<half, float> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup_with_cast();
|
||||
}
|
||||
@@ -1,197 +0,0 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
// optimize me. Use template to avoid copy code.
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
class GET_ROW_F16 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_F16() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *indices_ne_ub, size_t *indices_nb_ub,
|
||||
int64_t *output_ne_ub, size_t *output_nb_ub) {
|
||||
// TODO, use template for F16/f32
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// Indices has two dims. n_elements = all rows should get.
|
||||
// dr, all rows should this thread get.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ half *)input);
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
|
||||
& ~31);
|
||||
uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
|
||||
& ~31);
|
||||
|
||||
local_buffer_elems = input_local_buffer_size / sizeof(half);
|
||||
|
||||
// TODO, consider long row that can't put in UB.
|
||||
// All data should asign to 32. It's ok because all data is align to 32.
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
|
||||
size_t origin_len = len;
|
||||
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
|
||||
const size_t elem_per_block = 32 / sizeof(half);
|
||||
size_t tail = len % elem_per_block;
|
||||
len = len & ~(elem_per_block - 1);
|
||||
if(tail != 0) {
|
||||
len += elem_per_block;
|
||||
}
|
||||
DataCopy(input_local, input_gm[offset], len);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
const size_t elem_per_block = 32 / sizeof(float);
|
||||
size_t tail = len % elem_per_block;
|
||||
len = len & ~(elem_per_block - 1);
|
||||
if (len > 0) {
|
||||
DataCopy(output_gm[offset], output_local, len);
|
||||
}
|
||||
|
||||
if(tail != 0) {
|
||||
#ifdef ASCEND_310P
|
||||
for (size_t i = tail; i < elem_per_block; i++) {
|
||||
output_local[len + i].SetValue(0, 0);
|
||||
}
|
||||
SetAtomicAdd<float>();
|
||||
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
|
||||
SetAtomicNone();
|
||||
#else
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = tail * sizeof(float);
|
||||
DataCopyPad(output_gm[offset + len], output_local[len],
|
||||
dataCopyParams);
|
||||
#endif
|
||||
}
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_row(int64_t idx) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3];
|
||||
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3];
|
||||
|
||||
copy_in(input_offset, input_ne[0]);
|
||||
LocalTensor<half> input_local = input_queue.DeQue<half>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
Cast(output_local, input_local, RoundMode::CAST_NONE,
|
||||
local_buffer_elems);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset, input_ne[0]);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
calculate_row(i);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
size_t local_buffer_elems;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<half> input_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
int64_t op_block_idx;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_f16(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
|
||||
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_F16 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
|
||||
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
|
||||
@@ -1,190 +0,0 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
// optimize me. Use template to avoid copy code.
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
class GET_ROW_F32 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_F32() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *indices_ne_ub, size_t *indices_nb_ub,
|
||||
int64_t *output_ne_ub, size_t *output_nb_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// Indices has two dims. n_elements = all rows should get.
|
||||
// dr, all rows should this thread get.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ float *)input);
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
|
||||
local_buffer_elems = local_buffer_size / sizeof(float);
|
||||
|
||||
// TODO, consider long row that can't put in UB.
|
||||
// All data should asign to 32. It's ok because all data is align to 32.
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
|
||||
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
|
||||
const size_t elem_per_block = 32 / sizeof(float);
|
||||
size_t tail = len % elem_per_block;
|
||||
len = len & ~(elem_per_block - 1);
|
||||
if(tail != 0) {
|
||||
len += elem_per_block;
|
||||
}
|
||||
DataCopy(input_local, input_gm[offset], len);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
const size_t elem_per_block = 32 / sizeof(float);
|
||||
size_t tail = len % elem_per_block;
|
||||
len = len & ~(elem_per_block - 1);
|
||||
if (len > 0) {
|
||||
DataCopy(output_gm[offset], output_local, len);
|
||||
}
|
||||
|
||||
if(tail != 0) {
|
||||
#ifdef ASCEND_310P
|
||||
for (size_t i = tail; i < elem_per_block; i++) {
|
||||
output_local[len + i].SetValue(0, 0);
|
||||
}
|
||||
SetAtomicAdd<float>();
|
||||
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
|
||||
SetAtomicNone();
|
||||
#else
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = tail * sizeof(float);
|
||||
DataCopyPad(output_gm[offset + len], output_local[len],
|
||||
dataCopyParams);
|
||||
#endif
|
||||
}
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_row(int64_t idx) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3];
|
||||
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3];
|
||||
|
||||
copy_in(input_offset, input_ne[0]);
|
||||
LocalTensor<float> input_local = input_queue.DeQue<float>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
DataCopy(output_local, input_local, local_buffer_elems);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset, input_ne[0]);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
calculate_row(i);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
size_t local_buffer_elems;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<float> input_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
int64_t op_block_idx;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_f32(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
|
||||
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_F32 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
|
||||
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
|
||||
@@ -1,204 +0,0 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
// optimize me. Use template to avoid copy code.
|
||||
using namespace AscendC;
|
||||
#ifdef ASCEND_310P // 310P not support 4bit get row
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
||||
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||
printf("Ascend310P not support 4bit get row.\n");
|
||||
}
|
||||
#else
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
#define QK4_0 32
|
||||
|
||||
class GET_ROW_Q4_0 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_Q4_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, int64_t *indices_ne_ub,
|
||||
size_t *indices_nb_ub, int64_t *output_ne_ub,
|
||||
size_t *output_nb_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
scale_ne[i] = input_ne_ub[i];
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// one scale for a group.
|
||||
scale_ne[0] /= QK4_0;
|
||||
|
||||
input_stride[0] = 1;
|
||||
scale_stride[0] = 1;
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
group_size_in_row = input_ne[0] / QK4_0;
|
||||
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
|
||||
input_ne[3] / 2;
|
||||
|
||||
// Indices has two dims. n_elements = all rows should get.
|
||||
// dr, all rows should this thread get.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
|
||||
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
|
||||
// 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
|
||||
DataCopy(input_local, input_gm[offset], QK4_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
DataCopy(output_gm[offset], output_local, QK4_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3] +
|
||||
group * QK4_0;
|
||||
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
|
||||
indices_ne1_idx * scale_stride[2] +
|
||||
indices_ne2_idx * scale_stride[3] + group;
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3] +
|
||||
group * QK4_0;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
|
||||
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
// TODO: cast more data to speed up.
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
|
||||
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
|
||||
|
||||
// Only mul need compile by group.
|
||||
half scale = scale_gm.GetValue(scale_offset);
|
||||
|
||||
Muls(output_local, output_local, (float)scale, QK4_0);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
output_queue.EnQue(output_local);
|
||||
|
||||
copy_out(output_offset);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
calculate_group(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t scale_ne[4];
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<int4b_t> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
||||
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_Q4_0 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
|
||||
indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
#endif // #ifdef ASCEND_310P
|
||||
@@ -1,191 +0,0 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
// optimize me. Use template to avoid copy code.
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
#define QK8_0 32
|
||||
|
||||
class GET_ROW_Q8_0 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_Q8_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, int64_t *indices_ne_ub,
|
||||
size_t *indices_nb_ub, int64_t *output_ne_ub,
|
||||
size_t *output_nb_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
scale_ne[i] = input_ne_ub[i];
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// one scale for a group.
|
||||
scale_ne[0] /= QK8_0;
|
||||
|
||||
input_stride[0] = 1;
|
||||
scale_stride[0] = 1;
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
group_size_in_row = input_ne[0] / QK8_0;
|
||||
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
|
||||
input_ne[3] * sizeof(int8_t);
|
||||
|
||||
// Indices has two dims. n_elements = all rows should get.
|
||||
// dr, all rows should this thread get.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
|
||||
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
|
||||
DataCopy(input_local, input_gm[offset], QK8_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
DataCopy(output_gm[offset], output_local, QK8_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3] +
|
||||
group * QK8_0;
|
||||
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
|
||||
indices_ne1_idx * scale_stride[2] +
|
||||
indices_ne2_idx * scale_stride[3] + group;
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3] +
|
||||
group * QK8_0;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
|
||||
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
// TODO: cast more data to speed up.
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
|
||||
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
|
||||
|
||||
// Only mul need compile by group.
|
||||
half scale = scale_gm.GetValue(scale_offset);
|
||||
Muls(output_local, output_local, (float)scale, QK8_0);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
output_queue.EnQue(output_local);
|
||||
|
||||
copy_out(output_offset);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
calculate_group(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t scale_ne[4];
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<int8_t> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
||||
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_Q8_0 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
|
||||
indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
|
||||
@@ -1,218 +0,0 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
#ifdef ASCEND_310P
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||
printf("Ascend310P not support f16->8bit quantization.\n");
|
||||
}
|
||||
#else
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
#define QK8_0 32
|
||||
|
||||
class QUANTIZE_F16_Q8_0 {
|
||||
public:
|
||||
__aicore__ inline QUANTIZE_F16_Q8_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *output_ne_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
}
|
||||
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
|
||||
}
|
||||
|
||||
scale_ne = input_ne;
|
||||
scale_stride[0] = 1;
|
||||
scale_stride[1] = input_ne[0] / QK8_0;
|
||||
for (int i = 2; i < 4; i++) {
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// split input tensor by rows.
|
||||
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
|
||||
dr = nr / op_block_num;
|
||||
|
||||
uint64_t tails = nr % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
group_size_in_row = scale_stride[1];
|
||||
int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
|
||||
output_ne[3] * sizeof(uint8_t);
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ half *)input);
|
||||
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
|
||||
group_size_in_row *
|
||||
sizeof(half)));
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
|
||||
pipe.InitBuffer(work_queue, 1, 32);
|
||||
pipe.InitBuffer(max_queue, 1, 32);
|
||||
pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
|
||||
pipe.InitBuffer(scale_queue, 1, 32);
|
||||
pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
|
||||
DataCopy(input_local, input_gm[offset], QK8_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
|
||||
DataCopy(output_gm[offset], output_local, QK8_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
|
||||
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
|
||||
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
|
||||
const int64_t i1 =
|
||||
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
|
||||
|
||||
const int64_t input_offset = i1 * input_stride[1] +
|
||||
i2 * input_stride[2] +
|
||||
i3 * input_stride[3] + QK8_0 * group;
|
||||
|
||||
const int64_t output_offset = i1 * output_stride[1] +
|
||||
i2 * output_stride[2] +
|
||||
i3 * output_stride[3] + QK8_0 * group;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<half> input_local = input_queue.DeQue<half>();
|
||||
LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
|
||||
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
|
||||
LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
|
||||
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
|
||||
LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
|
||||
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
|
||||
Abs(abs_local, cast_local, QK8_0);
|
||||
ReduceMax(max_local, abs_local, work_local, QK8_0);
|
||||
|
||||
pipe_barrier(PIPE_ALL);
|
||||
float d = max_local.GetValue(0);
|
||||
d = d / ((1 << 7) - 1);
|
||||
if (d != 0) {
|
||||
Muls(cast_local, cast_local, 1.0f / d, QK8_0);
|
||||
}
|
||||
|
||||
Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
work_queue.FreeTensor(work_local);
|
||||
abs_queue.FreeTensor(abs_local);
|
||||
max_queue.FreeTensor(max_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
return (half)d;
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
|
||||
uint32_t scale_local_offset = 0;
|
||||
uint32_t scale_global_offset = 0;
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
half scale = calculate_group(i, j);
|
||||
scale_local.SetValue(scale_local_offset++, scale);
|
||||
if (scale_local_offset == 16) {
|
||||
scale_local_offset = 0;
|
||||
// TODO: OPTIMIZE ME
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopy(scale_gm[scale_global_offset], scale_local, 16);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
scale_global_offset += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scale_local_offset != 0) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
|
||||
DataCopyPad(scale_gm[scale_global_offset], scale_local,
|
||||
dataCopyParams);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t *scale_ne;
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<half> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int8_t> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, 1> work_queue;
|
||||
TQue<QuePosition::VECOUT, 1> max_queue;
|
||||
TQue<QuePosition::VECIN, 1> abs_queue;
|
||||
TQue<QuePosition::VECOUT, 1> scale_queue;
|
||||
TQue<QuePosition::VECOUT, 1> cast_queue;
|
||||
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_F16_Q8_0 op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
#endif // #ifdef ASCEND_310P
|
||||
@@ -1,216 +0,0 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
#ifdef ASCEND_310P // 310P not support f32->8bit quantization
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||
printf("Ascend310P not support f32->8bit quantization.\n");
|
||||
}
|
||||
#else
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
#define QK8_0 32
|
||||
|
||||
class QUANTIZE_F32_Q8_0 {
|
||||
public:
|
||||
__aicore__ inline QUANTIZE_F32_Q8_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *output_ne_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
}
|
||||
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
|
||||
}
|
||||
|
||||
scale_ne = input_ne;
|
||||
scale_stride[0] = 1;
|
||||
scale_stride[1] = input_ne[0] / QK8_0;
|
||||
for (int i = 2; i < 4; i++) {
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// split input tensor by rows.
|
||||
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
|
||||
dr = nr / op_block_num;
|
||||
|
||||
uint64_t tails = nr % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
group_size_in_row = scale_stride[1];
|
||||
int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
|
||||
output_ne[3] * sizeof(uint8_t);
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ float *)input);
|
||||
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
|
||||
ir * group_size_in_row *
|
||||
sizeof(half)));
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
|
||||
pipe.InitBuffer(work_queue, 1, 32);
|
||||
pipe.InitBuffer(max_queue, 1, 32);
|
||||
pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
|
||||
pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
|
||||
pipe.InitBuffer(scale_queue, 1, 32);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
|
||||
DataCopy(input_local, input_gm[offset], QK8_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
|
||||
DataCopy(output_gm[offset], output_local, QK8_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
|
||||
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
|
||||
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
|
||||
const int64_t i1 =
|
||||
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
|
||||
|
||||
const int64_t input_offset = i1 * input_stride[1] +
|
||||
i2 * input_stride[2] +
|
||||
i3 * input_stride[3] + QK8_0 * group;
|
||||
|
||||
const int64_t output_offset = i1 * output_stride[1] +
|
||||
i2 * output_stride[2] +
|
||||
i3 * output_stride[3] + QK8_0 * group;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<float> input_local = input_queue.DeQue<float>();
|
||||
LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
|
||||
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
|
||||
LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
|
||||
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
|
||||
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
||||
|
||||
Abs(abs_local, input_local, QK8_0);
|
||||
ReduceMax(max_local, abs_local, work_local, QK8_0);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
float d = max_local.GetValue(0);
|
||||
d = d / ((1 << 7) - 1);
|
||||
if (d != 0) {
|
||||
Muls(input_local, input_local, 1.0f / d, QK8_0);
|
||||
}
|
||||
|
||||
Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
work_queue.FreeTensor(work_local);
|
||||
abs_queue.FreeTensor(abs_local);
|
||||
max_queue.FreeTensor(max_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
|
||||
return (half)d;
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
|
||||
uint32_t scale_local_offset = 0;
|
||||
uint32_t scale_global_offset = 0;
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
half scale = calculate_group(i, j);
|
||||
scale_local.SetValue(scale_local_offset++, scale);
|
||||
if (scale_local_offset == 16) {
|
||||
scale_local_offset = 0;
|
||||
// TODO: OPTIMIZE ME
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopy(scale_gm[scale_global_offset], scale_local, 16);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
scale_global_offset += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scale_local_offset != 0) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
|
||||
DataCopyPad(scale_gm[scale_global_offset], scale_local,
|
||||
dataCopyParams);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t *scale_ne;
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<float> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int8_t> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, 1> work_queue;
|
||||
TQue<QuePosition::VECOUT, 1> max_queue;
|
||||
TQue<QuePosition::VECIN, 1> abs_queue;
|
||||
TQue<QuePosition::VECIN, 1> cast_queue;
|
||||
TQue<QuePosition::VECOUT, 1> scale_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_F32_Q8_0 op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
#endif // #ifdef ASCEND_310P
|
||||
@@ -1,295 +0,0 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
#ifdef ASCEND_310P // 310P not support float->4bit quantization
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||
printf("Ascend310P not support f32->4bit quantization.\n");
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||
printf("Ascend310P not support f16->4bit quantization.\n");
|
||||
}
|
||||
#else
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
#define Group_Size 32
|
||||
|
||||
template <typename SRC_T>
|
||||
class QUANTIZE_FLOAT_TO_Q4_0 {
|
||||
public:
|
||||
__aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *output_ne_ub) {
|
||||
// TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
|
||||
// permute=[0,0,0,0]):
|
||||
// [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
// input stride of data elements
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
}
|
||||
|
||||
// output stride of data elements
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
|
||||
}
|
||||
|
||||
// scale saved one by one after data:. [group1_scale, group2_scale, ...]
|
||||
scale_ne = input_ne;
|
||||
scale_stride[0] = 1;
|
||||
scale_stride[1] = input_ne[0] / Group_Size;
|
||||
for (int i = 2; i < 4; i++) {
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// split input tensor by rows.
|
||||
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
|
||||
dr = nr / op_block_num;
|
||||
|
||||
uint64_t tails = nr % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
group_size_in_row = scale_stride[1];
|
||||
int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
|
||||
output_ne[3] * sizeof(uint8_t) / 2;
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
|
||||
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
|
||||
group_size_in_row *
|
||||
sizeof(half)));
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM,
|
||||
Group_Size * sizeof(int8_t) / 2);
|
||||
pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
|
||||
pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
|
||||
pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
|
||||
DataCopy(input_local, input_gm[offset], Group_Size);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
// reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t,
|
||||
// and using DataCopyPad to avoid 32 bits align.
|
||||
LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
|
||||
LocalTensor<int8_t> output_int8_local =
|
||||
output_local.ReinterpretCast<int8_t>();
|
||||
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
|
||||
DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
|
||||
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
|
||||
LocalTensor<float> input_local) {
|
||||
DataCopy(cast_local, input_local, Group_Size);
|
||||
}
|
||||
|
||||
__aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
|
||||
LocalTensor<half> input_local) {
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
|
||||
}
|
||||
|
||||
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
|
||||
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
|
||||
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
|
||||
const int64_t i1 =
|
||||
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
|
||||
|
||||
const int64_t input_offset = i1 * input_stride[1] +
|
||||
i2 * input_stride[2] +
|
||||
i3 * input_stride[3] + Group_Size * group;
|
||||
|
||||
// output_offset is stride for output_gm which datatype is int8_t and
|
||||
// divided by 2 is needed for int4b_t.
|
||||
const int64_t output_offset = (i1 * output_stride[1] +
|
||||
i2 * output_stride[2] +
|
||||
i3 * output_stride[3] +
|
||||
Group_Size * group) / 2;
|
||||
copy_in(input_offset);
|
||||
|
||||
LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
|
||||
LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
|
||||
LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
|
||||
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
|
||||
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
|
||||
LocalTensor<float> min_local = min_queue.AllocTensor<float>();
|
||||
LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
|
||||
LocalTensor<half> half_local = half_queue.AllocTensor<half>();
|
||||
|
||||
input_to_cast(cast_local, input_local);
|
||||
|
||||
ReduceMax(max_local, cast_local, work_local, Group_Size);
|
||||
ReduceMin(min_local, cast_local, work_local, Group_Size);
|
||||
const float max_value = max_local.GetValue(0);
|
||||
const float min_value = min_local.GetValue(0);
|
||||
float d = max_value;
|
||||
if (min_value < 0 && (-1 * min_value) > max_value) {
|
||||
d = min_value;
|
||||
}
|
||||
|
||||
d = d / (-8);
|
||||
if (d != 0) {
|
||||
Muls(cast_local, cast_local, 1.0f / d, Group_Size);
|
||||
}
|
||||
|
||||
// range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
|
||||
float scalar = 8.5f;
|
||||
Adds(cast_local, cast_local, scalar, Group_Size);
|
||||
Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
|
||||
scalar = 15.0f;
|
||||
Mins(cast_local, cast_local, scalar, Group_Size);
|
||||
scalar = -8.0f;
|
||||
Adds(cast_local, cast_local, scalar, Group_Size);
|
||||
|
||||
// float->half->int4b
|
||||
Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
|
||||
Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
|
||||
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
work_queue.FreeTensor(work_local);
|
||||
max_queue.FreeTensor(max_local);
|
||||
min_queue.FreeTensor(min_local);
|
||||
int8_queue.FreeTensor(int8_local);
|
||||
half_queue.FreeTensor(half_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
return (half)d;
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
|
||||
uint32_t scale_local_offset = 0;
|
||||
uint32_t scale_global_offset = 0;
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
half scale = calculate_group(i, j);
|
||||
scale_local.SetValue(scale_local_offset++, scale);
|
||||
// Copy Group_Size/2 length data each time.
|
||||
if (scale_local_offset == Group_Size / 2) {
|
||||
scale_local_offset = 0;
|
||||
// TODO: OPTIMIZE ME
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopy(scale_gm[scale_global_offset], scale_local,
|
||||
Group_Size / 2);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
scale_global_offset += Group_Size / 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scale_local_offset != 0) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
|
||||
DataCopyPad(scale_gm[scale_global_offset], scale_local,
|
||||
dataCopyParams);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
}
|
||||
scale_queue.FreeTensor(scale_local);
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t *scale_ne;
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<SRC_T> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int8_t> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_FLOAT_TO_Q4_0<half> op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_FLOAT_TO_Q4_0<float> op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
#endif // #ifdef ASCEND_310P
|
||||
@@ -158,6 +158,12 @@ typedef sycl::half2 ggml_half2;
|
||||
|
||||
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_EXTENSION
|
||||
#else // _MSC_VER
|
||||
#define GGML_EXTENSION __extension__
|
||||
#endif // _MSC_VER
|
||||
|
||||
#define QK4_0 32
|
||||
typedef struct {
|
||||
ggml_half d; // delta
|
||||
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b
|
||||
|
||||
#define QK4_1 32
|
||||
typedef struct {
|
||||
union {
|
||||
GGML_EXTENSION union {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half m; // min
|
||||
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0
|
||||
|
||||
#define QK5_1 32
|
||||
typedef struct {
|
||||
union {
|
||||
GGML_EXTENSION union {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half m; // min
|
||||
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block
|
||||
|
||||
#define QK8_1 32
|
||||
typedef struct {
|
||||
union {
|
||||
GGML_EXTENSION union {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half s; // d * sum(qs[i])
|
||||
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
|
||||
typedef struct {
|
||||
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
||||
uint8_t qs[QK_K/4]; // quants
|
||||
union {
|
||||
GGML_EXTENSION union {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 4.5 bits per weight
|
||||
typedef struct {
|
||||
union {
|
||||
GGML_EXTENSION union {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 5.5 bits per weight
|
||||
typedef struct {
|
||||
union {
|
||||
GGML_EXTENSION union {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
|
||||
@@ -23,6 +23,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
ggml-cpu/amx/mmq.cpp
|
||||
ggml-cpu/amx/mmq.h
|
||||
ggml-cpu/ggml-cpu-impl.h
|
||||
ggml-cpu/common.h
|
||||
ggml-cpu/binary-ops.h
|
||||
ggml-cpu/binary-ops.cpp
|
||||
ggml-cpu/unary-ops.h
|
||||
ggml-cpu/unary-ops.cpp
|
||||
)
|
||||
|
||||
target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
|
||||
@@ -289,23 +294,29 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
|
||||
message(STATUS "PowerPC detected")
|
||||
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
file(READ "/proc/cpuinfo" POWER10_M)
|
||||
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
|
||||
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
||||
endif()
|
||||
if (GGML_NATIVE)
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
file(READ "/proc/cpuinfo" POWER10_M)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
|
||||
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
||||
endif()
|
||||
|
||||
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
|
||||
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
|
||||
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
|
||||
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
|
||||
|
||||
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
|
||||
elseif (EXTRACTED_NUMBER EQUAL 9)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
|
||||
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
|
||||
elseif (EXTRACTED_NUMBER EQUAL 9)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
|
||||
endif()
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
|
||||
if (GGML_CPU_POWERPC_CPUTYPE)
|
||||
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
|
||||
endif()
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
message(STATUS "loongarch64 detected")
|
||||
|
||||
158
ggml/src/ggml-cpu/binary-ops.cpp
Normal file
158
ggml/src/ggml-cpu/binary-ops.cpp
Normal file
@@ -0,0 +1,158 @@
|
||||
#include "binary-ops.h"
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
#include <Accelerate/Accelerate.h>
|
||||
|
||||
using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
|
||||
#endif
|
||||
|
||||
static inline float op_add(float a, float b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
static inline float op_sub(float a, float b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
static inline float op_mul(float a, float b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
static inline float op_div(float a, float b) {
|
||||
return a / b;
|
||||
}
|
||||
|
||||
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
|
||||
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
|
||||
constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
|
||||
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
|
||||
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
|
||||
constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
|
||||
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
int i10 = i % ne10;
|
||||
const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
|
||||
z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT( nb0 == sizeof(dst_t));
|
||||
GGML_ASSERT(nb00 == sizeof(src0_t));
|
||||
|
||||
const auto [ir0, ir1] = get_thread_range(params, src0);
|
||||
const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
|
||||
|
||||
if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src1));
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
vDSP_fn_t vDSP_op = nullptr;
|
||||
// TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
if (op == op_add) {
|
||||
vDSP_op = vDSP_vadd;
|
||||
} else if (op == op_sub) {
|
||||
vDSP_op = vDSP_vsub;
|
||||
} else if (op == op_mul) {
|
||||
vDSP_op = vDSP_vmul;
|
||||
} else if (op == op_div) {
|
||||
vDSP_op = vDSP_vdiv;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
const int64_t i13 = i03 % ne13;
|
||||
const int64_t i12 = i02 % ne12;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
|
||||
dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||
const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
||||
|
||||
if (is_src1_contiguous) {
|
||||
// src1 is broadcastable across src0 and dst in i1, i2, i3
|
||||
const int64_t nr0 = ne00 / ne10;
|
||||
|
||||
for (int64_t r = 0; r < nr0; ++r) {
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
|
||||
if (vDSP_op != nullptr) {
|
||||
vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
|
||||
}
|
||||
} else {
|
||||
vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
|
||||
template <float (*op)(float, float)>
|
||||
static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
/* */ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
|
||||
apply_binary_op<op, float, float, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
|
||||
apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
|
||||
apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) {
|
||||
apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
||||
apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_add>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_sub>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_mul>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_div>(params, dst);
|
||||
}
|
||||
16
ggml/src/ggml-cpu/binary-ops.h
Normal file
16
ggml/src/ggml-cpu/binary-ops.h
Normal file
@@ -0,0 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
72
ggml/src/ggml-cpu/common.h
Normal file
72
ggml/src/ggml-cpu/common.h
Normal file
@@ -0,0 +1,72 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu-traits.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
#include <utility>
|
||||
|
||||
// convenience functions/macros for use in template calls
|
||||
// note: these won't be required after the 'traits' lookup table is used.
|
||||
static inline ggml_fp16_t f32_to_f16(float x) {
|
||||
return GGML_FP32_TO_FP16(x);
|
||||
}
|
||||
|
||||
static inline float f16_to_f32(ggml_fp16_t x) {
|
||||
return GGML_FP16_TO_FP32(x);
|
||||
}
|
||||
|
||||
static inline ggml_bf16_t f32_to_bf16(float x) {
|
||||
return GGML_FP32_TO_BF16(x);
|
||||
}
|
||||
|
||||
static inline float bf16_to_f32(ggml_bf16_t x) {
|
||||
return GGML_BF16_TO_FP32(x);
|
||||
}
|
||||
|
||||
static inline float f32_to_f32(float x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
// TODO - merge this into the traits table, after using row-based conversions
|
||||
template <class T>
|
||||
struct type_conversion_table;
|
||||
|
||||
template <>
|
||||
struct type_conversion_table<ggml_fp16_t> {
|
||||
static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
|
||||
static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_conversion_table<float> {
|
||||
static constexpr float (*to_f32)(float) = f32_to_f32;
|
||||
static constexpr float (*from_f32)(float) = f32_to_f32;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_conversion_table<ggml_bf16_t> {
|
||||
static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
|
||||
static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
|
||||
};
|
||||
|
||||
static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
|
||||
const int64_t ith = params->ith;
|
||||
const int64_t nth = params->nth;
|
||||
|
||||
const int64_t nr = ggml_nrows(src0);
|
||||
|
||||
// rows per thread
|
||||
const int64_t dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int64_t ir0 = dr*ith;
|
||||
const int64_t ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
return {ir0, ir1};
|
||||
}
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2680,13 +2680,25 @@ class tinyBLAS_PPC {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
vec_t vec_A[4] {0}, vec_B[4] = {0};
|
||||
for (int l=0; l<k; l+=4) {
|
||||
if (RN >= 4 && RM == 1) {
|
||||
/* 'GEMV Forwarding' concept is used in first two conditional loops.
|
||||
* when one of the matrix has a single row/column, the elements are
|
||||
* broadcasted, instead of using packing routine to prepack the
|
||||
* matrix elements.
|
||||
*/
|
||||
if (RM == 1) {
|
||||
TA* a = const_cast<TA*>(A+(ii)*lda+l);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
|
||||
vec_A[0] = (vec_t)vec_xl(0,a);
|
||||
vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
|
||||
vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
|
||||
vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
|
||||
} else if (RN == 1) {
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
|
||||
TB* b = const_cast<TB*>(B+(jj)*ldb+l);
|
||||
vec_B[0] = (vec_t)vec_xl(0,b);
|
||||
vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1));
|
||||
vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2));
|
||||
vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3));
|
||||
} else {
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
|
||||
@@ -2790,8 +2802,10 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
assert(params->ith < params->nth);
|
||||
|
||||
// only enable sgemm for prompt processing
|
||||
#if !defined(__MMA__)
|
||||
if (n < 2)
|
||||
return false;
|
||||
#endif
|
||||
|
||||
if (Ctype != GGML_TYPE_F32)
|
||||
return false;
|
||||
|
||||
186
ggml/src/ggml-cpu/unary-ops.cpp
Normal file
186
ggml/src/ggml-cpu/unary-ops.cpp
Normal file
@@ -0,0 +1,186 @@
|
||||
#include "unary-ops.h"
|
||||
|
||||
static inline float op_abs(float x) {
|
||||
return fabsf(x);
|
||||
}
|
||||
|
||||
static inline float op_sgn(float x) {
|
||||
return (x > 0.f) ? 1.f : ((x < 0.f) ? -1.f : 0.f);
|
||||
}
|
||||
|
||||
static inline float op_neg(float x) {
|
||||
return -x;
|
||||
}
|
||||
|
||||
static inline float op_step(float x) {
|
||||
return (x > 0.f) ? 1.f : 0.f;
|
||||
}
|
||||
|
||||
static inline float op_tanh(float x) {
|
||||
return tanhf(x);
|
||||
}
|
||||
|
||||
static inline float op_elu(float x) {
|
||||
return (x > 0.f) ? x : expm1f(x);
|
||||
}
|
||||
|
||||
static inline float op_relu(float x) {
|
||||
return (x > 0.f) ? x : 0.f;
|
||||
}
|
||||
|
||||
static inline float op_sigmoid(float x) {
|
||||
return 1.f / (1.f + expf(-x));
|
||||
}
|
||||
|
||||
static inline float op_hardsigmoid(float x) {
|
||||
return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
static inline float op_exp(float x) {
|
||||
return expf(x);
|
||||
}
|
||||
|
||||
static inline float op_hardswish(float x) {
|
||||
return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
static inline float op_sqr(float x) {
|
||||
return x * x;
|
||||
}
|
||||
|
||||
static inline float op_sqrt(float x) {
|
||||
return sqrtf(x);
|
||||
}
|
||||
|
||||
static inline float op_sin(float x) {
|
||||
return sinf(x);
|
||||
}
|
||||
|
||||
static inline float op_cos(float x) {
|
||||
return cosf(x);
|
||||
}
|
||||
|
||||
static inline float op_log(float x) {
|
||||
return logf(x);
|
||||
}
|
||||
|
||||
template <float (*op)(float), typename src0_t, typename dst_t>
|
||||
static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
|
||||
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
|
||||
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
y[i] = f32_to_dst(op(src0_to_f32(x[i])));
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*op)(float), typename src0_t, typename dst_t>
|
||||
static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT( nb0 == sizeof(dst_t));
|
||||
GGML_ASSERT(nb00 == sizeof(src0_t));
|
||||
|
||||
const auto [ir0, ir1] = get_thread_range(params, src0);
|
||||
|
||||
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||
const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
|
||||
vec_unary_op<op>(ne0, dst_ptr, src0_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
|
||||
template <float (*op)(float)>
|
||||
static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
/* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
|
||||
apply_unary_op<op, float, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
|
||||
apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
|
||||
apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
|
||||
apply_unary_op<op, ggml_bf16_t, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
apply_unary_op<op, ggml_fp16_t, float>(params, dst);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_abs>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_sgn>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_neg>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_step>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_tanh>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_elu>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_relu>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_sigmoid>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_hardsigmoid>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_exp>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_hardswish>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_sqr>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_sqrt>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_sin>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_cos>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
unary_op<op_log>(params, dst);
|
||||
}
|
||||
28
ggml/src/ggml-cpu/unary-ops.h
Normal file
28
ggml/src/ggml-cpu/unary-ops.h
Normal file
@@ -0,0 +1,28 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_compute_forward_abs(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sgn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_neg(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_step(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_tanh(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_elu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_hardsigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_exp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_hardswish(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sqr(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -288,6 +288,10 @@ static __device__ void no_device_code(
|
||||
__trap();
|
||||
|
||||
GGML_UNUSED(no_device_code); // suppress unused function warning
|
||||
|
||||
#if defined(GGML_USE_MUSA)
|
||||
__builtin_unreachable();
|
||||
#endif // defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
@@ -725,7 +729,13 @@ struct ggml_cuda_graph {
|
||||
bool disable_due_to_failed_graph_capture = false;
|
||||
int number_consecutive_updates = 0;
|
||||
std::vector<ggml_graph_node_properties> ggml_graph_properties;
|
||||
std::vector<char **> updated_kernel_arg;
|
||||
bool use_cpy_indirection = false;
|
||||
std::vector<char *> cpy_dest_ptrs;
|
||||
char ** dest_ptrs_d;
|
||||
int dest_ptrs_size = 0;
|
||||
// Index to allow each cpy kernel to be aware of it's position within the graph
|
||||
// relative to other cpy nodes.
|
||||
int graph_cpynode_index = -1;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (blockIdx.y < ne01) { // src0
|
||||
if (blockIdx.y < (unsigned)ne01) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
@@ -64,7 +64,7 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (blockIdx.z < ne02) { // src0
|
||||
if (blockIdx.z < (unsigned)ne02) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
|
||||
@@ -34,6 +34,10 @@ static __global__ void conv_transpose_1d_kernel(
|
||||
}
|
||||
}
|
||||
dst[global_index] = accumulator;
|
||||
GGML_UNUSED(p0); GGML_UNUSED(d0); GGML_UNUSED(src0_ne3);
|
||||
GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
|
||||
GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
|
||||
GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
|
||||
}
|
||||
|
||||
static void conv_transpose_1d_f32_f32_cuda(
|
||||
@@ -75,8 +79,6 @@ void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
const int p0 = 0;//opts[3];
|
||||
const int d0 = 1;//opts[4];
|
||||
|
||||
const int64_t kernel_size = ggml_nelements(src0);
|
||||
const int64_t input_size = ggml_nelements(src1);
|
||||
const int64_t output_size = ggml_nelements(dst);
|
||||
|
||||
conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
|
||||
|
||||
@@ -577,7 +577,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
|
||||
return;
|
||||
}
|
||||
|
||||
const src_t * x = (src_t *) vx;
|
||||
const src_t * x = (const src_t *) vx;
|
||||
|
||||
y[i] = x[i];
|
||||
}
|
||||
|
||||
@@ -32,16 +32,18 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
||||
static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
|
||||
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
|
||||
|
||||
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// then combine those indices with the corresponding byte offsets to get the total offsets
|
||||
const int64_t i03 = i/(ne00 * ne01 * ne02);
|
||||
@@ -288,16 +290,18 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
||||
static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
|
||||
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
|
||||
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
@@ -314,16 +318,18 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
|
||||
static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
|
||||
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
|
||||
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
@@ -339,66 +345,84 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
// Copy destination pointers to GPU to be available when pointer indirection is in use
|
||||
|
||||
void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
if (cuda_graph->dest_ptrs_d != nullptr) {
|
||||
CUDA_CHECK(cudaFree(cuda_graph->dest_ptrs_d));
|
||||
}
|
||||
CUDA_CHECK(cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *)));
|
||||
cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
|
||||
}
|
||||
// copy destination pointers to GPU
|
||||
CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
|
||||
cuda_graph->graph_cpynode_index = 0; // reset index
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f32_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f32_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q8_0_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
GGML_ASSERT(ne % QK8_0 == 0);
|
||||
const int num_blocks = ne / QK8_0;
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_q8_0_f32_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = ne;
|
||||
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_0_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_0 == 0);
|
||||
const int num_blocks = ne / QK4_0;
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_q4_0_f32_cuda(
|
||||
@@ -407,22 +431,22 @@ static void ggml_cpy_q4_0_f32_cuda(
|
||||
const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12,
|
||||
const int nb10, const int nb11, const int nb12, const int nb13,
|
||||
cudaStream_t stream) {
|
||||
cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
const int num_blocks = ne;
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_1_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_1 == 0);
|
||||
const int num_blocks = ne / QK4_1;
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_q4_1_f32_cuda(
|
||||
@@ -431,22 +455,22 @@ static void ggml_cpy_q4_1_f32_cuda(
|
||||
const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12,
|
||||
const int nb10, const int nb11, const int nb12, const int nb13,
|
||||
cudaStream_t stream) {
|
||||
cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
const int num_blocks = ne;
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q5_0_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
GGML_ASSERT(ne % QK5_0 == 0);
|
||||
const int num_blocks = ne / QK5_0;
|
||||
cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_q5_0_f32_cuda(
|
||||
@@ -455,22 +479,22 @@ static void ggml_cpy_q5_0_f32_cuda(
|
||||
const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12,
|
||||
const int nb10, const int nb11, const int nb12, const int nb13,
|
||||
cudaStream_t stream) {
|
||||
cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
const int num_blocks = ne;
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q5_1_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
GGML_ASSERT(ne % QK5_1 == 0);
|
||||
const int num_blocks = ne / QK5_1;
|
||||
cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_q5_1_f32_cuda(
|
||||
@@ -479,32 +503,32 @@ static void ggml_cpy_q5_1_f32_cuda(
|
||||
const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12,
|
||||
const int nb10, const int nb11, const int nb12, const int nb13,
|
||||
cudaStream_t stream) {
|
||||
cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
const int num_blocks = ne;
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_NL == 0);
|
||||
const int num_blocks = ne / QK4_NL;
|
||||
cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
@@ -541,46 +565,60 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
char * src0_ddc = (char *) src0->data;
|
||||
char * src1_ddc = (char *) src1->data;
|
||||
|
||||
char ** dest_ptrs_d = nullptr;
|
||||
int graph_cpynode_index = -1;
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
if(ctx.cuda_graph->use_cpy_indirection) {
|
||||
dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
|
||||
graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
|
||||
}
|
||||
#endif
|
||||
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
|
||||
ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
|
||||
ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
|
||||
ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
}
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
if(ctx.cuda_graph->use_cpy_indirection) {
|
||||
ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -7,3 +7,5 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
|
||||
|
||||
void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
|
||||
|
||||
@@ -315,14 +315,14 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
|
||||
|
||||
float vals[sizeof(int)] = {0.0f};
|
||||
#pragma unroll
|
||||
for (int l = 0; l < sizeof(int); ++l) {
|
||||
for (int l = 0; l < int(sizeof(int)); ++l) {
|
||||
vals[l] = scale * x[4*threadIdx.x + l];
|
||||
}
|
||||
|
||||
float amax = fabsf(vals[0]);
|
||||
float sum = vals[0];
|
||||
#pragma unroll
|
||||
for (int l = 1; l < sizeof(int); ++l) {
|
||||
for (int l = 1; l < int(sizeof(int)); ++l) {
|
||||
amax = fmaxf(amax, fabsf(vals[l]));
|
||||
sum += vals[l];
|
||||
}
|
||||
@@ -338,7 +338,7 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
|
||||
|
||||
if (d != 0.0f) {
|
||||
#pragma unroll
|
||||
for (int l = 0; l < sizeof(int); ++l) {
|
||||
for (int l = 0; l < int(sizeof(int)); ++l) {
|
||||
q8[l] = roundf(vals[l] / d);
|
||||
}
|
||||
}
|
||||
@@ -638,7 +638,7 @@ static __global__ void flash_attn_combine_results(
|
||||
float VKQ_denominator = 0.0f;
|
||||
for (int l = 0; l < parallel_blocks; ++l) {
|
||||
const float diff = meta[l].x - kqmax;
|
||||
const float KQ_max_scale = expf(diff);
|
||||
float KQ_max_scale = expf(diff);
|
||||
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
|
||||
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
|
||||
|
||||
@@ -649,6 +649,7 @@ static __global__ void flash_attn_combine_results(
|
||||
dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator;
|
||||
}
|
||||
|
||||
[[noreturn]]
|
||||
static void on_no_fattn_vec_case(const int D) {
|
||||
if (D == 64) {
|
||||
fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
|
||||
|
||||
@@ -406,6 +406,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
#endif // CP_ASYNC_AVAILABLE
|
||||
|
||||
#else
|
||||
GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
|
||||
GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_KV);
|
||||
GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
|
||||
GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
|
||||
GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
|
||||
GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
|
||||
GGML_UNUSED(kb0);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
}
|
||||
@@ -797,6 +806,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
__syncthreads();
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
|
||||
GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1);
|
||||
GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_KV); GGML_UNUSED(stride_mask);
|
||||
GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
}
|
||||
@@ -931,6 +946,16 @@ static __global__ void flash_attn_ext_f16(
|
||||
(Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
|
||||
#else
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
|
||||
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
|
||||
GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
|
||||
}
|
||||
@@ -985,38 +1010,38 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
|
||||
extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/4, 4); \
|
||||
extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/8, 8); \
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8)
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16)
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32)
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64);
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64)
|
||||
DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64)
|
||||
|
||||
// Kernels with ncols == 128 are only 4% faster due to register pressure.
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128);
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128);
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128);
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128);
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128);
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128); // Needs too much shared memory.
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128)
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128)
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128)
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128)
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128)
|
||||
// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128) // Needs too much shared memory.
|
||||
|
||||
@@ -282,7 +282,19 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
}
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
|
||||
|
||||
@@ -281,6 +281,18 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -292,7 +292,19 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
|
||||
|
||||
@@ -277,6 +277,16 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
|
||||
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
|
||||
GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -430,7 +430,17 @@ static __global__ void flash_attn_ext_f16(
|
||||
dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = dst_meta_val;
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
|
||||
}
|
||||
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
#include "ggml-cuda/rope.cuh"
|
||||
#include "ggml-cuda/scale.cuh"
|
||||
#include "ggml-cuda/softmax.cuh"
|
||||
#include "ggml-cuda/ssm-conv.cuh"
|
||||
#include "ggml-cuda/ssm-scan.cuh"
|
||||
#include "ggml-cuda/sum.cuh"
|
||||
#include "ggml-cuda/sumrows.cuh"
|
||||
#include "ggml-cuda/tsembd.cuh"
|
||||
@@ -2296,6 +2298,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_SUM_ROWS:
|
||||
ggml_cuda_op_sum_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SSM_CONV:
|
||||
ggml_cuda_op_ssm_conv(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SSM_SCAN:
|
||||
ggml_cuda_op_ssm_scan(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_ARGSORT:
|
||||
ggml_cuda_op_argsort(ctx, dst);
|
||||
break;
|
||||
@@ -2433,10 +2441,11 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
||||
std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
|
||||
bool use_cuda_graph) {
|
||||
|
||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||
cuda_ctx->cuda_graph->updated_kernel_arg.clear();
|
||||
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
@@ -2468,8 +2477,11 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_CPY) {
|
||||
// store the copy op parameter which changes with each token.
|
||||
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
||||
|
||||
// Store the pointers which are updated for each token, such that these can be sent
|
||||
// to the device and accessed using indirection from CUDA graph
|
||||
cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
|
||||
|
||||
// store a pointer to each copy op CUDA kernel to identify it later
|
||||
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||
if (!ptr) {
|
||||
@@ -2477,10 +2489,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
|
||||
#endif
|
||||
} else {
|
||||
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
||||
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2489,6 +2497,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
}
|
||||
}
|
||||
|
||||
if (use_cuda_graph) {
|
||||
cuda_ctx->cuda_graph->use_cpy_indirection = true;
|
||||
// copy pointers to GPU so they can be accessed via indirection within CUDA graph
|
||||
ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
|
||||
}
|
||||
|
||||
return use_cuda_graph;
|
||||
}
|
||||
|
||||
@@ -2543,51 +2557,6 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
|
||||
return true;
|
||||
}
|
||||
|
||||
static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
|
||||
|
||||
if (cuda_graph_update_required) {
|
||||
// Extract nodes from graph
|
||||
// First call with null argument gets number of nodes in graph
|
||||
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
|
||||
// Subsequent call with non-null argument gets nodes
|
||||
cuda_ctx->cuda_graph->nodes.clear();
|
||||
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
|
||||
cuda_ctx->cuda_graph->params.clear();
|
||||
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
|
||||
if (cuda_ctx->cuda_graph->num_nodes > 0) {
|
||||
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
|
||||
|
||||
// Loop over nodes, and extract kernel parameters from each node
|
||||
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
||||
cudaGraphNodeType node_type;
|
||||
CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
|
||||
if (node_type == cudaGraphNodeTypeKernel) {
|
||||
cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
|
||||
if (stat == cudaErrorInvalidDeviceFunction) {
|
||||
// Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
|
||||
// We don't need to update blas nodes, so clear error and move on.
|
||||
(void)cudaGetLastError();
|
||||
} else {
|
||||
GGML_ASSERT(stat == cudaSuccess);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// One of the arguments to the copy kernel is updated for each token, hence we need to
|
||||
// replace that argument with the updated value in the CUDA graph
|
||||
// on update steps, the live parameters will already be captured
|
||||
int k = 0;
|
||||
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
||||
if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
|
||||
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
|
||||
*(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr;
|
||||
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
|
||||
|
||||
bool cuda_graph_update_required = false;
|
||||
@@ -2647,8 +2616,7 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
|
||||
#endif
|
||||
|
||||
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
||||
[[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
|
||||
bool & cuda_graph_update_required) {
|
||||
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
|
||||
|
||||
while (!graph_evaluated_or_captured) {
|
||||
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
|
||||
@@ -2698,13 +2666,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
|
||||
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
|
||||
}
|
||||
|
||||
// Perform update to graph (if required for this token), and change copy parameter (required for every token)
|
||||
maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
|
||||
|
||||
// Update graph executable
|
||||
update_cuda_graph_executable(cuda_ctx);
|
||||
|
||||
if (cuda_graph_update_required) { // Update graph executable
|
||||
update_cuda_graph_executable(cuda_ctx);
|
||||
}
|
||||
// Launch graph
|
||||
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
|
||||
#else
|
||||
@@ -2718,10 +2682,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
|
||||
// vector of pointers to CUDA cpy kernels, which are required to identify
|
||||
// kernel parameters which need updated in the graph for each token
|
||||
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
|
||||
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
|
||||
|
||||
@@ -2755,8 +2715,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
if (use_cuda_graph) {
|
||||
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
|
||||
|
||||
use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
|
||||
ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
|
||||
use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
|
||||
|
||||
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
|
||||
if (use_cuda_graph && cuda_graph_update_required) {
|
||||
@@ -2777,6 +2736,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
|
||||
}
|
||||
|
||||
if (!use_cuda_graph) {
|
||||
cuda_ctx->cuda_graph->use_cpy_indirection = false;
|
||||
}
|
||||
|
||||
#else
|
||||
bool use_cuda_graph = false;
|
||||
bool cuda_graph_update_required = false;
|
||||
@@ -2784,7 +2747,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
|
||||
bool graph_evaluated_or_captured = false;
|
||||
|
||||
evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
|
||||
evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -3193,6 +3156,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_SSM_SCAN:
|
||||
case GGML_OP_SSM_CONV:
|
||||
return true;
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
@@ -3232,6 +3197,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
#ifndef FLASH_ATTN_AVAILABLE
|
||||
return false;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
||||
// different head sizes of K and V are not supported yet
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 192) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -26,6 +26,7 @@ static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
|
||||
asm("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;"
|
||||
: "=r"(ret) : "r"(x));
|
||||
#else
|
||||
GGML_UNUSED(x);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(NEW_MMA_AVAILABLE)
|
||||
return ret;
|
||||
@@ -178,6 +179,7 @@ namespace ggml_cuda_mma {
|
||||
: "l"(xs));
|
||||
#else
|
||||
load_generic(xs0, stride);
|
||||
GGML_UNUSED(t);
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
|
||||
@@ -945,7 +945,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
|
||||
GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
}
|
||||
@@ -1024,7 +1024,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int k01 = 0; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
|
||||
for (int k01 = 0; k01 < WARP_SIZE/2; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
#pragma unroll
|
||||
@@ -1035,19 +1035,34 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
if (k01 < WARP_SIZE/2) {
|
||||
constexpr int ns = 2;
|
||||
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
|
||||
&x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
|
||||
&x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
|
||||
&y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
|
||||
} else {
|
||||
constexpr int ns = 1;
|
||||
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
|
||||
&x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
|
||||
&x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
|
||||
&y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
|
||||
}
|
||||
constexpr int ns = 2;
|
||||
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
|
||||
&x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
|
||||
&x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
|
||||
&y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Some compilers fail to unroll the loop over k01 if there is a conditional statement for ns in the inner loop.
|
||||
// As a workaround 2 separate loops are used instead.
|
||||
#pragma unroll
|
||||
for (int k01 = WARP_SIZE/2; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
constexpr int ns = 1;
|
||||
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
|
||||
&x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
|
||||
&x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
|
||||
&y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1176,7 +1191,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
|
||||
GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
}
|
||||
@@ -1253,7 +1268,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
const float d = bxi->d;
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < sizeof(int); ++l) {
|
||||
for (int l = 0; l < int(sizeof(int)); ++l) {
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*(threadIdx.x % (WARP_SIZE/8)) + l] = d*sc8[l];
|
||||
}
|
||||
#else
|
||||
@@ -1376,7 +1391,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < sizeof(int); ++l) {
|
||||
for (int l = 0; l < int(sizeof(int)); ++l) {
|
||||
x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
|
||||
}
|
||||
}
|
||||
@@ -1517,7 +1532,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < sizeof(int); ++l) {
|
||||
for (int l = 0; l < int(sizeof(int)); ++l) {
|
||||
x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
|
||||
}
|
||||
}
|
||||
@@ -1810,7 +1825,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
|
||||
GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
}
|
||||
@@ -2570,6 +2585,8 @@ static __device__ void mul_mat_q_process_tile(
|
||||
} else {
|
||||
write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j);
|
||||
}
|
||||
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne10);
|
||||
}
|
||||
|
||||
|
||||
@@ -2695,7 +2712,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
|
||||
|
||||
// Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block:
|
||||
if (it != blockIdx.x || jt != blockIdx.y) {
|
||||
if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -2825,7 +2842,6 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
template <ggml_type type>
|
||||
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ static __global__ void mul_mat_vec(
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
float sumf;
|
||||
float sumf = 0.0f;
|
||||
|
||||
if constexpr (std::is_same<T, half>::value) {
|
||||
const half2 * x2 = (const half2 *) x;
|
||||
|
||||
@@ -151,7 +151,7 @@ static __global__ void mul_mat_vec_q(
|
||||
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
|
||||
|
||||
// partial sum for each thread
|
||||
float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
|
||||
float tmp[ncols_y][rows_per_cuda_block] = {{0.0f}};
|
||||
|
||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||
|
||||
@@ -197,10 +197,12 @@ static __global__ void mul_mat_vec_q(
|
||||
tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
|
||||
}
|
||||
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < (unsigned)nrows_dst)) {
|
||||
dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
GGML_UNUSED(nrows_x);
|
||||
}
|
||||
|
||||
static std::pair<dim3, dim3> calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) {
|
||||
|
||||
@@ -14,7 +14,7 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
|
||||
if (nidx < ne00 && blockIdx.y < (unsigned)ne01 && blockIdx.z < (unsigned)(ne02*ne03)) {
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne00 +
|
||||
|
||||
151
ggml/src/ggml-cuda/ssm-conv.cu
Normal file
151
ggml/src/ggml-cuda/ssm-conv.cu
Normal file
@@ -0,0 +1,151 @@
|
||||
#include "ssm-conv.cuh"
|
||||
|
||||
template <size_t split_d_inner, size_t d_conv>
|
||||
static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float * __restrict__ src1,
|
||||
const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1,
|
||||
float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2,
|
||||
const int nc, const int ncs, const int nr, const int n_t, const int n_s) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bidx = blockIdx.x;
|
||||
const int bidy = blockIdx.y;
|
||||
|
||||
const float * x_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1);
|
||||
const float * w_block = (const float *) ((char *) src1 + bidy * split_d_inner * src1_nb1);
|
||||
float * y_block = (float *) ((char *) dst + bidx * dst_nb2 + bidy * split_d_inner * dst_nb0);
|
||||
|
||||
const int stride_x = src0_nb1 / sizeof(float);
|
||||
const int stride_w = src1_nb1 / sizeof(float);
|
||||
const int stride_y = dst_nb1 / sizeof(float);
|
||||
|
||||
float x[d_conv] = { 0.0f };
|
||||
float w[d_conv] = { 0.0f };
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
w[j] = w_block[tid * stride_w + j];
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_t; i++) {
|
||||
float sumf = 0.0f;
|
||||
|
||||
if (i == 0) {
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
x[j] = x_block[tid * stride_x + j];
|
||||
}
|
||||
} else {
|
||||
x[(i - 1) % d_conv] = x_block[tid * stride_x + i + d_conv - 1];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
sumf += x[(i + j) % d_conv] * w[j];
|
||||
}
|
||||
y_block[i * stride_y + tid] = sumf;
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t split_d_inner, size_t d_conv, size_t split_n_t>
|
||||
static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, const float * __restrict__ src1,
|
||||
const int src0_nb0, const int src0_nb1, const int src0_nb2,
|
||||
const int src1_nb1, float * __restrict__ dst, const int dst_nb0,
|
||||
const int dst_nb1, const int dst_nb2, const int nc, const int ncs,
|
||||
const int nr, const int n_t, const int n_s) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bidx = blockIdx.x;
|
||||
const int bidy = blockIdx.y;
|
||||
const int bidz = blockIdx.z;
|
||||
|
||||
const float * x_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1 +
|
||||
bidz * split_n_t * src0_nb0);
|
||||
const float * w_block = (const float *) ((char *) src1 + bidy * split_d_inner * src1_nb1);
|
||||
float * y_block =
|
||||
(float *) ((char *) dst + bidx * dst_nb2 + bidz * split_n_t * dst_nb1 + bidy * split_d_inner * dst_nb0);
|
||||
|
||||
const int stride_x = src0_nb1 / sizeof(float);
|
||||
const int stride_w = src1_nb1 / sizeof(float);
|
||||
const int stride_y = dst_nb1 / sizeof(float);
|
||||
|
||||
float x[d_conv] = { 0.0f };
|
||||
float w[d_conv] = { 0.0f };
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
w[j] = w_block[tid * stride_w + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < split_n_t; i++) {
|
||||
if (bidz * split_n_t + i < n_t) {
|
||||
float sumf = 0.0f;
|
||||
|
||||
if (i == 0) {
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
x[j] = x_block[tid * stride_x + j];
|
||||
}
|
||||
} else {
|
||||
x[(i - 1) % d_conv] = x_block[tid * stride_x + i + d_conv - 1];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
sumf += x[(i + j) % d_conv] * w[j];
|
||||
}
|
||||
y_block[i * stride_y + tid] = sumf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int src0_nb0, const int src0_nb1,
|
||||
const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1,
|
||||
const int dst_nb2, const int nc, const int ncs, const int nr, const int n_t,
|
||||
const int n_s, cudaStream_t stream) {
|
||||
const int threads = 128;
|
||||
GGML_ASSERT(nr % threads == 0);
|
||||
|
||||
if (n_t <= 32) {
|
||||
const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
|
||||
if (nc == 4) {
|
||||
ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
|
||||
dst, dst_nb0, dst_nb1, dst_nb2, nc, ncs, nr, n_t,
|
||||
n_s);
|
||||
} else {
|
||||
GGML_ABORT("Only support kernel size = 4 now.");
|
||||
}
|
||||
} else {
|
||||
if (nc == 4) {
|
||||
const int split_n_t = 32;
|
||||
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
|
||||
ssm_conv_long_token_f32<threads, 4, split_n_t>
|
||||
<<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0,
|
||||
dst_nb1, dst_nb2, nc, ncs, nr, n_t, n_s);
|
||||
} else {
|
||||
GGML_ABORT("Only support kernel size = 4 right now.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const struct ggml_tensor * src0 = dst->src[0]; // conv_x
|
||||
const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
|
||||
|
||||
const int nc = src1->ne[0]; // d_conv
|
||||
const int ncs = src0->ne[0]; // d_conv - 1 + n_t
|
||||
const int nr = src0->ne[1]; // d_inner
|
||||
const int n_t = dst->ne[1]; // tokens per sequence
|
||||
const int n_s = dst->ne[2]; // number of sequences in the batch
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nr);
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
ssm_conv_f32_cuda(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, dst->nb[0], dst->nb[1],
|
||||
dst->nb[2], nc, ncs, nr, n_t, n_s, stream);
|
||||
}
|
||||
3
ggml/src/ggml-cuda/ssm-conv.cuh
Normal file
3
ggml/src/ggml-cuda/ssm-conv.cuh
Normal file
@@ -0,0 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
155
ggml/src/ggml-cuda/ssm-scan.cu
Normal file
155
ggml/src/ggml-cuda/ssm-scan.cu
Normal file
@@ -0,0 +1,155 @@
|
||||
#include "ssm-scan.cuh"
|
||||
|
||||
// #include <cuda_runtime.h>
|
||||
// static __device__ void global_to_shared(const float *src, float *dst) {
|
||||
// asm volatile("cp.async.");
|
||||
// }
|
||||
|
||||
template <size_t splitD, size_t N>
|
||||
__global__ void __launch_bounds__(splitD, 2)
|
||||
ssm_scan_f32(const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
|
||||
const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5,
|
||||
const int src0_nb1, const int src0_nb2, const int src1_nb0, const int src1_nb1, const int src1_nb2,
|
||||
const int src1_nb3, const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1,
|
||||
const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2,
|
||||
float * __restrict__ dst, const int D, const int L, const int B) {
|
||||
const int bidx = blockIdx.x; // split along B
|
||||
const int bidy = blockIdx.y; // split along D
|
||||
const int tid = threadIdx.x;
|
||||
const int wid = tid / 32;
|
||||
const int wtid = tid % 32;
|
||||
|
||||
extern __shared__ float smem[];
|
||||
const int stride_sA = N + 1;
|
||||
const int stride_ss0 = N + 1;
|
||||
float * smem_A = smem;
|
||||
float * smem_s0 = smem_A + splitD * stride_sA;
|
||||
|
||||
const float * s0_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * splitD * src0_nb1);
|
||||
const float * x_block = (const float *) ((char *) src1 + (bidx * src1_nb2) + bidy * splitD * sizeof(float));
|
||||
const float * dt_block = (const float *) ((char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float));
|
||||
const float * A_block = (const float *) ((char *) src3 + bidy * splitD * src3_nb1);
|
||||
const float * B_block = (const float *) ((char *) src4 + (bidx * src4_nb2));
|
||||
const float * C_block = (const float *) ((char *) src5 + (bidx * src5_nb2));
|
||||
float * y_block = (float *) ((char *) dst + (bidx * src1_nb2) + bidy * splitD * sizeof(float));
|
||||
float * s_block = (float *) ((char *) dst + src1_nb3 + bidx * src0_nb2 + bidy * splitD * src0_nb1);
|
||||
|
||||
const int stride_s0 = src0_nb1 / sizeof(float);
|
||||
const int stride_x = src1_nb1 / sizeof(float);
|
||||
const int stride_dt = src2_nb1 / sizeof(float);
|
||||
const int stride_A = src3_nb1 / sizeof(float);
|
||||
const int stride_B = src4_nb1 / sizeof(float);
|
||||
const int stride_C = src5_nb1 / sizeof(float);
|
||||
const int stride_s = stride_s0;
|
||||
const int stride_y = stride_x;
|
||||
|
||||
// can N not be 16? for example 32?
|
||||
if (N == 16) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < splitD / 4; i += 2) {
|
||||
float value = A_block[(wid * warpSize + i) * stride_A + wtid];
|
||||
// todo: bank conflict
|
||||
// I am always confused with how to use the swizzling method to solve
|
||||
// bank conflit. Hoping somebody can tell me.
|
||||
smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < splitD / 4; i += 2) {
|
||||
float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid];
|
||||
smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int i = 0; i < L; i++) {
|
||||
float dt_soft_plus = dt_block[i * stride_dt + tid];
|
||||
if (dt_soft_plus <= 20.0f) {
|
||||
dt_soft_plus = log1pf(exp(dt_soft_plus));
|
||||
}
|
||||
float x_dt = x_block[i * stride_x + tid] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < N; j++) {
|
||||
float state = (smem_s0[tid * stride_ss0 + j] * expf(dt_soft_plus * smem_A[tid * stride_sA + j])) +
|
||||
(B_block[i * stride_B + j] * x_dt);
|
||||
sumf += state * C_block[i * stride_C + j];
|
||||
if (i == L - 1) {
|
||||
s_block[tid * stride_s + j] = state;
|
||||
} else {
|
||||
smem_s0[tid * stride_ss0 + j] = state;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
y_block[i * stride_y + tid] = sumf;
|
||||
}
|
||||
}
|
||||
|
||||
static void ssm_scan_f32_cuda(const float * src0, const float * src1, const float * src2, const float * src3,
|
||||
const float * src4, const float * src5, const int src0_nb1, const int src0_nb2,
|
||||
const int src1_nb0, const int src1_nb1, const int src1_nb2, const int src1_nb3,
|
||||
const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1,
|
||||
const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2,
|
||||
float * dst, const int N, const int D, const int L, const int B, cudaStream_t stream) {
|
||||
const int threads = 128;
|
||||
// todo: consider D cannot be divided,does this situation exist?
|
||||
GGML_ASSERT(D % threads == 0);
|
||||
const dim3 blocks(B, (D + threads - 1) / threads, 1);
|
||||
const int smem_size = (threads * (N + 1) * 2) * sizeof(float);
|
||||
if (N == 16) {
|
||||
ssm_scan_f32<128, 16><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src0_nb1, src0_nb2, src1_nb0, src1_nb1, src1_nb2, src1_nb3, src2_nb0,
|
||||
src2_nb1, src2_nb2, src3_nb1, src4_nb1, src4_nb2, src5_nb1, src5_nb2, dst, D, L, B);
|
||||
} else {
|
||||
GGML_ABORT("doesn't support N!=16.");
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const struct ggml_tensor * src0 = dst->src[0]; // s
|
||||
const struct ggml_tensor * src1 = dst->src[1]; // x
|
||||
const struct ggml_tensor * src2 = dst->src[2]; // dt
|
||||
const struct ggml_tensor * src3 = dst->src[3]; // A
|
||||
const struct ggml_tensor * src4 = dst->src[4]; // B
|
||||
const struct ggml_tensor * src5 = dst->src[5]; // C
|
||||
|
||||
// const int64_t d_state = src0->ne[0];
|
||||
// const int64_t d_inner = src0->ne[1];
|
||||
// const int64_t l = src1->ne[1];
|
||||
// const int64_t b = src0->ne[2];
|
||||
|
||||
const int64_t nc = src0->ne[0]; // d_state
|
||||
const int64_t nr = src0->ne[1]; // d_inner
|
||||
const int64_t n_t = src1->ne[1]; // number of tokens per sequence
|
||||
const int64_t n_s = src0->ne[2]; // number of sequences in the batch
|
||||
|
||||
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src2->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src3->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src4->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src5->nb[0] == sizeof(float));
|
||||
// required for the dot product between s and C
|
||||
GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
|
||||
// required for per-sequence offsets for states
|
||||
GGML_ASSERT(src0->nb[2] == src0->ne[0] * src0->ne[1] * sizeof(float));
|
||||
// required to get correct offset for state destination (i.e. src1->nb[3])
|
||||
GGML_ASSERT(src1->nb[3] == src1->ne[0] * src1->ne[1] * src1->ne[2] * sizeof(float));
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
const float * src2_d = (const float *) src2->data;
|
||||
const float * src3_d = (const float *) src3->data;
|
||||
const float * src4_d = (const float *) src4->data;
|
||||
const float * src5_d = (const float *) src5->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
ssm_scan_f32_cuda(src0_d, src1_d, src2_d, src3_d, src4_d, src5_d, src0->nb[1], src0->nb[2], src1->nb[0],
|
||||
src1->nb[1], src1->nb[2], src1->nb[3], src2->nb[0], src2->nb[1], src2->nb[2], src3->nb[1],
|
||||
src4->nb[1], src4->nb[2], src5->nb[1], src5->nb[2], dst_d, nc, nr, n_t, n_s, stream);
|
||||
}
|
||||
3
ggml/src/ggml-cuda/ssm-scan.cuh
Normal file
3
ggml/src/ggml-cuda/ssm-scan.cuh
Normal file
@@ -0,0 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -19,7 +19,7 @@ static __global__ void upscale_f32(const float * x, float * dst,
|
||||
int i02 = i12 / sf2;
|
||||
int i03 = i13 / sf3;
|
||||
|
||||
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
||||
dst[index] = *( (const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00) );
|
||||
}
|
||||
|
||||
static void upscale_f32_cuda(const float * x, float * dst,
|
||||
|
||||
@@ -219,9 +219,12 @@ typedef struct {
|
||||
int32_t ne11;
|
||||
int32_t ne_12_2; // assume K and V are same shape
|
||||
int32_t ne_12_3;
|
||||
uint64_t nb_12_1;
|
||||
uint64_t nb_12_2;
|
||||
uint64_t nb_12_3;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
uint64_t nb13;
|
||||
uint64_t nb21;
|
||||
uint64_t nb22;
|
||||
uint64_t nb23;
|
||||
uint64_t nb31;
|
||||
int32_t ne1;
|
||||
int32_t ne2;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -48,7 +48,7 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_f16_t4(device const half4 * src, short il, thread type4 & reg) {
|
||||
reg = (type4)(*(src + il));
|
||||
reg = (type4)(*(src));
|
||||
}
|
||||
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
@@ -56,6 +56,11 @@ template <typename type4x4>
|
||||
void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) {
|
||||
reg = (type4x4)(*src);
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_bf16_t4(device const bfloat4 * src, short il, thread type4 & reg) {
|
||||
reg = (type4)(*(src));
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename type4x4>
|
||||
@@ -3100,7 +3105,8 @@ template<
|
||||
typename vd4x4_t, // key type in device memory
|
||||
short nl_v,
|
||||
void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
|
||||
short D, // head size
|
||||
short DK, // K head size
|
||||
short DV, // V head size
|
||||
short Q = 8, // queries per threadgroup
|
||||
short KV = 8, // key/value processed per each simdgroup
|
||||
short C = 32> // cache items per threadgroup
|
||||
@@ -3122,20 +3128,24 @@ kernel void kernel_flash_attn_ext(
|
||||
const int iq2 = tgpig[1];
|
||||
const int iq1 = tgpig[0]*Q;
|
||||
|
||||
const short D4 = D/4;
|
||||
const short D8 = D/8;
|
||||
const short D16 = D/16;
|
||||
const short NW = N_SIMDWIDTH;
|
||||
const short SH = (2*C + Q); // shared memory per simdgroup (s_t == float)
|
||||
constexpr short DK4 = DK/4;
|
||||
constexpr short DK8 = DK/8;
|
||||
constexpr short DK16 = DK/16;
|
||||
constexpr short DV4 = DV/4;
|
||||
constexpr short DV8 = DV/8;
|
||||
constexpr short DV16 = DV/16;
|
||||
|
||||
constexpr short NW = N_SIMDWIDTH;
|
||||
constexpr short SH = (2*C + Q); // shared memory per simdgroup (s_t == float)
|
||||
|
||||
const short TS = nsg*SH; // shared memory size per query in (s_t == float)
|
||||
const short T = D + 2*TS; // shared memory size per query in (half)
|
||||
const short T = DK + 2*TS; // shared memory size per query in (half)
|
||||
|
||||
threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*D); // holds the query data
|
||||
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*D); // same as above but in q4_t
|
||||
threadgroup o_t * so = (threadgroup o_t *) (shmem_f16 + 0*D); // reuse query data for accumulation
|
||||
threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 0*D); // same as above but in o4_t
|
||||
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + 2*sgitg*SH + Q*D); // scratch buffer for attention, mask and diagonal matrix
|
||||
threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*DK); // holds the query data
|
||||
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t
|
||||
threadgroup o_t * so = (threadgroup o_t *) (shmem_f16 + 0*DK); // reuse query data for accumulation
|
||||
threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 0*DK); // same as above but in o4_t
|
||||
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + 2*sgitg*SH + Q*DK); // scratch buffer for attention, mask and diagonal matrix
|
||||
|
||||
threadgroup k_t * sk = (threadgroup k_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory
|
||||
threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // same as above but in k4x4_t
|
||||
@@ -3144,23 +3154,23 @@ kernel void kernel_flash_attn_ext(
|
||||
threadgroup v4x4_t * sv4x4 = (threadgroup v4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // same as above but in v4x4_t
|
||||
|
||||
// store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
|
||||
o8x8_t lo[D8];
|
||||
o8x8_t lo[DV8];
|
||||
|
||||
// load heads from Q to shared memory
|
||||
for (short j = sgitg; j < Q; j += nsg) {
|
||||
device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*args.nb01 + iq2*args.nb02 + iq3*args.nb03));
|
||||
|
||||
for (short i = tiisg; i < D4; i += NW) {
|
||||
for (short i = tiisg; i < DK4; i += NW) {
|
||||
if (iq1 + j < args.ne01) {
|
||||
sq4[j*D4 + i] = (q4_t) q4[i];
|
||||
sq4[j*DK4 + i] = (q4_t) q4[i];
|
||||
} else {
|
||||
sq4[j*D4 + i] = (q4_t) 0.0f;
|
||||
sq4[j*DK4 + i] = (q4_t) 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// zero out lo
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
lo[i] = make_filled_simdgroup_matrix<o_t, 8>((o_t) 0.0f);
|
||||
}
|
||||
|
||||
@@ -3174,8 +3184,8 @@ kernel void kernel_flash_attn_ext(
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
{
|
||||
half S[Q] = { [0 ... Q-1] = 0.0f };
|
||||
half M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
|
||||
float S[Q] = { [0 ... Q-1] = 0.0f };
|
||||
float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
|
||||
|
||||
// thread indices inside the simdgroup
|
||||
// TODO: see if we can utilize quad-group functions for better performance
|
||||
@@ -3190,22 +3200,15 @@ kernel void kernel_flash_attn_ext(
|
||||
const short ikv2 = iq2/(args.ne02/args.ne_12_2);
|
||||
const short ikv3 = iq3/(args.ne03/args.ne_12_3);
|
||||
|
||||
// load the queries from shared memory into local memory
|
||||
q8x8_t mq[D8];
|
||||
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
simdgroup_load(mq[i], sq + i*8, D);
|
||||
}
|
||||
|
||||
const bool has_mask = mask != q;
|
||||
|
||||
half slope = 1.0f;
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (args.max_bias > 0.0f) {
|
||||
const short h = iq2;
|
||||
|
||||
const half base = h < args.n_head_log2 ? args.m0 : args.m1;
|
||||
const float base = h < args.n_head_log2 ? args.m0 : args.m1;
|
||||
const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
|
||||
|
||||
slope = pow(base, exph);
|
||||
@@ -3221,14 +3224,14 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
if (has_mask) {
|
||||
// used to detect blocks full of -INF
|
||||
half smax = -INFINITY;
|
||||
float smax = -INFINITY;
|
||||
|
||||
// load the mask in shared memory
|
||||
#pragma unroll(Q)
|
||||
for (short j = 0; j < Q; ++j) {
|
||||
device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31);
|
||||
|
||||
const half m = pm[ic + tiisg];
|
||||
const float m = pm[ic + tiisg];
|
||||
|
||||
ss[j*TS + C + tiisg] = m;
|
||||
smax = max(smax, m);
|
||||
@@ -3249,20 +3252,22 @@ kernel void kernel_flash_attn_ext(
|
||||
// this is compile-time check, so it does not have runtime overhead
|
||||
if (is_same<kd4x4_t, k4x4_t>::value) {
|
||||
// we can read directly from global memory
|
||||
device const k_t * pk = (device const k_t *) ((device const char *) k + ((ic + 8*cc)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
|
||||
device const k_t * pk = (device const k_t *) ((device const char *) k + ((ic + 8*cc)*args.nb11 + ikv2*args.nb12 + ikv3*args.nb13));
|
||||
|
||||
#pragma unroll(D8)
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
#pragma unroll(DK8)
|
||||
for (short i = 0; i < DK8; ++i) {
|
||||
k8x8_t mk;
|
||||
simdgroup_load(mk, pk + i*8, args.nb_12_1/sizeof(k_t), 0, true); // transpose // TODO: use ne10
|
||||
simdgroup_load(mk, pk + i*8, args.nb11/sizeof(k_t), 0, true); // transpose // TODO: use ne10
|
||||
|
||||
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
|
||||
q8x8_t mq;
|
||||
simdgroup_load(mq, sq + i*8, DK);
|
||||
simdgroup_multiply_accumulate(mqk, mq, mk, mqk);
|
||||
}
|
||||
} else {
|
||||
for (short ii = 0; ii < D16; ii += 4) {
|
||||
device const kd4x4_t * pk4x4 = (device const kd4x4_t *) ((device const char *) k + ((ic + 8*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
|
||||
for (short ii = 0; ii < DK16; ii += 4) {
|
||||
device const kd4x4_t * pk4x4 = (device const kd4x4_t *) ((device const char *) k + ((ic + 8*cc + ty)*args.nb11 + ikv2*args.nb12 + ikv3*args.nb13));
|
||||
|
||||
if (D16%4 == 0) {
|
||||
if (DK16%4 == 0) {
|
||||
// the head is evenly divisible by 4*16 = 64, so no need for bound checks
|
||||
{
|
||||
k4x4_t tmp;
|
||||
@@ -3275,15 +3280,18 @@ kernel void kernel_flash_attn_ext(
|
||||
#pragma unroll(4)
|
||||
for (short k = 0; k < 4; ++k) {
|
||||
k8x8_t mk;
|
||||
q8x8_t mq;
|
||||
|
||||
simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose
|
||||
simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk);
|
||||
simdgroup_load(mq, sq + (2*(ii + k) + 0)*8, DK);
|
||||
simdgroup_multiply_accumulate(mqk, mq, mk, mqk);
|
||||
|
||||
simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose
|
||||
simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk);
|
||||
simdgroup_load(mq, sq + (2*(ii + k) + 1)*8, DK);
|
||||
simdgroup_multiply_accumulate(mqk, mq, mk, mqk);
|
||||
}
|
||||
} else {
|
||||
if (ii + tx < D16) {
|
||||
if (ii + tx < DK16) {
|
||||
k4x4_t tmp;
|
||||
deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp);
|
||||
sk4x4[4*ty + tx] = tmp;
|
||||
@@ -3291,14 +3299,17 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
for (short k = 0; k < 4 && ii + k < D16; ++k) {
|
||||
for (short k = 0; k < 4 && ii + k < DK16; ++k) {
|
||||
k8x8_t mk;
|
||||
q8x8_t mq;
|
||||
|
||||
simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose
|
||||
simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk);
|
||||
simdgroup_load(mq, sq + (2*(ii + k) + 0)*8, DK);
|
||||
simdgroup_multiply_accumulate(mqk, mq, mk, mqk);
|
||||
|
||||
simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose
|
||||
simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk);
|
||||
simdgroup_load(mq, sq + (2*(ii + k) + 1)*8, DK);
|
||||
simdgroup_multiply_accumulate(mqk, mq, mk, mqk);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3316,10 +3327,10 @@ kernel void kernel_flash_attn_ext(
|
||||
// online softmax
|
||||
{
|
||||
for (ushort j = 0; j < Q; ++j) {
|
||||
const half m = M[j];
|
||||
const float m = M[j];
|
||||
|
||||
// scale and apply the logitcap / mask
|
||||
half s = ss[j*TS + tiisg]*args.scale;
|
||||
float s = ss[j*TS + tiisg]*args.scale;
|
||||
|
||||
if (args.logit_softcap != 0.0f) {
|
||||
s = args.logit_softcap*precise::tanh(s);
|
||||
@@ -3330,8 +3341,8 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
M[j] = simd_max(max(M[j], s));
|
||||
|
||||
const half ms = exp(m - M[j]);
|
||||
const half vs = exp(s - M[j]);
|
||||
const float ms = exp(m - M[j]);
|
||||
const float vs = exp(s - M[j]);
|
||||
|
||||
S[j] = S[j]*ms + simd_sum(vs);
|
||||
|
||||
@@ -3350,8 +3361,8 @@ kernel void kernel_flash_attn_ext(
|
||||
s8x8_t mm;
|
||||
simdgroup_load(mm, ss + 2*C, TS, 0, false);
|
||||
|
||||
#pragma unroll(D8)
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
#pragma unroll(DV8)
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
simdgroup_multiply(lo[i], mm, lo[i]);
|
||||
}
|
||||
}
|
||||
@@ -3364,20 +3375,20 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
if (is_same<vd4x4_t, v4x4_t>::value) {
|
||||
// we can read directly from global memory
|
||||
device const v_t * pv = (device const v_t *) ((device const char *) v + ((ic + 8*cc)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
|
||||
device const v_t * pv = (device const v_t *) ((device const char *) v + ((ic + 8*cc)*args.nb21 + ikv2*args.nb22 + ikv3*args.nb23));
|
||||
|
||||
#pragma unroll(D8)
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
#pragma unroll(DV8)
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
v8x8_t mv;
|
||||
simdgroup_load(mv, pv + i*8, args.nb_12_1/sizeof(v_t), 0, false); // TODO: use ne20
|
||||
simdgroup_load(mv, pv + i*8, args.nb21/sizeof(v_t), 0, false); // TODO: use ne20
|
||||
|
||||
simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
|
||||
}
|
||||
} else {
|
||||
for (short ii = 0; ii < D16; ii += 4) {
|
||||
device const vd4x4_t * pv4x4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 8*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
|
||||
for (short ii = 0; ii < DV16; ii += 4) {
|
||||
device const vd4x4_t * pv4x4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 8*cc + ty)*args.nb21 + ikv2*args.nb22 + ikv3*args.nb23));
|
||||
|
||||
if (D16%4 == 0) {
|
||||
if (DV16%4 == 0) {
|
||||
// no need for bound checks
|
||||
{
|
||||
v4x4_t tmp;
|
||||
@@ -3398,7 +3409,7 @@ kernel void kernel_flash_attn_ext(
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
|
||||
}
|
||||
} else {
|
||||
if (ii + tx < D16) {
|
||||
if (ii + tx < DV16) {
|
||||
v4x4_t tmp;
|
||||
deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp);
|
||||
sv4x4[4*ty + tx] = tmp;
|
||||
@@ -3406,7 +3417,7 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
for (short k = 0; k < 4 && ii + k < D16; ++k) {
|
||||
for (short k = 0; k < 4 && ii + k < DV16; ++k) {
|
||||
v8x8_t mv;
|
||||
|
||||
simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
|
||||
@@ -3433,15 +3444,15 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
// reduce the warps sequentially
|
||||
for (ushort sg = 1; sg < nsg; ++sg) {
|
||||
half S = { 0.0f };
|
||||
half M = { -__FLT16_MAX__/2 };
|
||||
float S = { 0.0f };
|
||||
float M = { -__FLT16_MAX__/2 };
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// each simdgroup stores its output to shared memory, reusing sq
|
||||
if (sgitg == sg) {
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
simdgroup_store(lo[i], so + i*8, D, 0, false);
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
simdgroup_store(lo[i], so + i*8, DV, 0, false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3450,16 +3461,16 @@ kernel void kernel_flash_attn_ext(
|
||||
// the first simdgroup accumulates the results from the other simdgroups
|
||||
if (sgitg == 0) {
|
||||
for (short j = 0; j < Q; ++j) {
|
||||
const half S0 = ss[j*TS + 0];
|
||||
const half S1 = ss[j*TS + sg*SH + 0];
|
||||
const float S0 = ss[j*TS + 0];
|
||||
const float S1 = ss[j*TS + sg*SH + 0];
|
||||
|
||||
const half M0 = ss[j*TS + 1];
|
||||
const half M1 = ss[j*TS + sg*SH + 1];
|
||||
const float M0 = ss[j*TS + 1];
|
||||
const float M1 = ss[j*TS + sg*SH + 1];
|
||||
|
||||
M = max(M0, M1);
|
||||
|
||||
const half ms0 = exp(M0 - M);
|
||||
const half ms1 = exp(M1 - M);
|
||||
const float ms0 = exp(M0 - M);
|
||||
const float ms1 = exp(M1 - M);
|
||||
|
||||
S = S0*ms0 + S1*ms1;
|
||||
|
||||
@@ -3480,11 +3491,11 @@ kernel void kernel_flash_attn_ext(
|
||||
simdgroup_load(ms0, ss + 2*C, TS, 0, false);
|
||||
simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
|
||||
|
||||
#pragma unroll(D8)
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
#pragma unroll(DV8)
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
o8x8_t t;
|
||||
|
||||
simdgroup_load (t, so + i*8, D, 0, false);
|
||||
simdgroup_load (t, so + i*8, DV, 0, false);
|
||||
simdgroup_multiply(t, ms1, t);
|
||||
|
||||
simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
|
||||
@@ -3495,8 +3506,8 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
// store result to shared memory (reuse sq)
|
||||
if (sgitg == 0) {
|
||||
for (short i = 0; i < D8; ++i) {
|
||||
simdgroup_store(lo[i], so + i*8, D, 0, false);
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
simdgroup_store(lo[i], so + i*8, DV, 0, false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3507,8 +3518,8 @@ kernel void kernel_flash_attn_ext(
|
||||
for (short j = 0; j < Q && iq1 + j < args.ne01; ++j) {
|
||||
const float S = ss[j*TS + 0];
|
||||
|
||||
for (short i = tiisg; i < D4; i += NW) {
|
||||
dst4[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*D4 + i] = (float4) so4[j*D4 + i]/S;
|
||||
for (short i = tiisg; i < DV4; i += NW) {
|
||||
dst4[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*DV4 + i] = (float4) so4[j*DV4 + i]/S;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3525,80 +3536,94 @@ kernel void kernel_flash_attn_ext(
|
||||
float, simdgroup_float8x8, \
|
||||
half, half4, simdgroup_half8x8
|
||||
|
||||
typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64>) flash_attn_ext_t;
|
||||
typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>) flash_attn_ext_t;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 80, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 96, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 112, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 128, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 256, 256>;
|
||||
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 80, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 96, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 112, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 128, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256, 256>;
|
||||
#endif
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 80, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 96, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 112, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 128, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256, 256>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 80, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 96, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 112, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 128, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256, 256>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 80, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 96, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 112, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 128, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256, 256>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 80, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 96, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 112, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 128, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256, 256>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 80, 80>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 96, 96>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 112, 112>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 128, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256, 256>;
|
||||
|
||||
#undef FA_TYPES
|
||||
|
||||
template<
|
||||
typename q4_t, // query types in shared memory
|
||||
typename q4x4_t,
|
||||
typename k4x4_t, // key types in shared memory
|
||||
typename v4x4_t, // value types in shared memory
|
||||
typename qk_t, // Q*K types
|
||||
typename s_t, // soft-max types
|
||||
typename q4_t, // query types in shared memory
|
||||
typename k4_t, // key types in shared memory
|
||||
typename v4_t, // value types in shared memory
|
||||
typename qk_t, // Q*K types
|
||||
typename s_t, // soft-max types
|
||||
typename s4_t,
|
||||
typename s4x4_t,
|
||||
typename o4x4_t, // attention accumulation types
|
||||
typename kd4x4_t, // key type in device memory
|
||||
typename o4_t, // attention accumulation types
|
||||
typename kd4_t, // key type in device memory
|
||||
short nl_k,
|
||||
void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
|
||||
typename vd4x4_t, // key type in device memory
|
||||
void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
|
||||
typename vd4_t, // key type in device memory
|
||||
short nl_v,
|
||||
void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
|
||||
short D, // head size
|
||||
short Q = 1, // queries per threadgroup
|
||||
short C = 32> // cache items per threadgroup
|
||||
void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
|
||||
short DK, // K head size
|
||||
short DV, // V head size
|
||||
short NE = 4, // head elements per thread
|
||||
short Q = 1, // queries per threadgroup
|
||||
short C = 32> // cache items per threadgroup
|
||||
kernel void kernel_flash_attn_ext_vec(
|
||||
constant ggml_metal_kargs_flash_attn_ext & args,
|
||||
device const char * q,
|
||||
@@ -3617,29 +3642,28 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
const int iq2 = tgpig[1];
|
||||
const int iq1 = tgpig[0];
|
||||
|
||||
const short D4 = D/4;
|
||||
const short D16 = D/16;
|
||||
const short NW = N_SIMDWIDTH;
|
||||
const short NL = NW/4; // note: this can be adjusted to support D%64 == 0 and D%32 == 0
|
||||
const short SH = 2*C; // shared memory per simdgroup
|
||||
constexpr short DK4 = DK/4;
|
||||
constexpr short DV4 = DV/4;
|
||||
constexpr short NW = N_SIMDWIDTH;
|
||||
constexpr short NL = NW/NE; // note: this can be adjusted to support different head sizes and simdgroup work loads
|
||||
constexpr short SH = 4*C; // shared memory per simdgroup
|
||||
|
||||
const short T = D + nsg*SH; // shared memory size per query in (half)
|
||||
const short T = DK + nsg*SH; // shared memory size per query in (half)
|
||||
|
||||
//threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*D); // holds the query data
|
||||
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*D); // same as above but in q4_t
|
||||
threadgroup q4x4_t * sq4x4 = (threadgroup q4x4_t *) (shmem_f16 + 0*D); // same as above but in q4x4_t
|
||||
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*D); // scratch buffer for attention
|
||||
threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*D); // same as above but in s4_t
|
||||
threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + C + Q*D); // scratch buffer for mask
|
||||
threadgroup o4x4_t * sr4x4 = (threadgroup o4x4_t *) (shmem_f16 + sgitg*D + Q*T); // scratch buffer for the results
|
||||
//threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*DK); // holds the query data
|
||||
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t
|
||||
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*DK); // scratch buffer for attention
|
||||
threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*DK); // same as above but in s4_t
|
||||
threadgroup float * sm = (threadgroup float *) (shmem_f16 + sgitg*SH + 2*C + Q*DK); // scratch buffer for mask
|
||||
threadgroup o4_t * sr4 = (threadgroup o4_t *) (shmem_f16 + sgitg*DV + Q*T); // scratch buffer for the results
|
||||
|
||||
// store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
|
||||
o4x4_t lo[D16/NL];
|
||||
// store the result for all queries in local memory (the O matrix from the paper)
|
||||
o4_t lo[DV4/NL];
|
||||
|
||||
// load heads from Q to shared memory
|
||||
device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*args.nb01 + iq2*args.nb02 + iq3*args.nb03));
|
||||
|
||||
for (short i = tiisg; i < D4; i += NW) {
|
||||
for (short i = tiisg; i < DK4; i += NW) {
|
||||
if (iq1 < args.ne01) {
|
||||
sq4[i] = (q4_t) q4[i];
|
||||
} else {
|
||||
@@ -3648,8 +3672,8 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
}
|
||||
|
||||
// zero out lo
|
||||
for (short i = 0; i < D16/NL; ++i) {
|
||||
lo[i] = (o4x4_t) 0.0f;
|
||||
for (short i = 0; i < DV4/NL; ++i) {
|
||||
lo[i] = (o4_t) 0.0f;
|
||||
}
|
||||
|
||||
// zero out shared memory SH
|
||||
@@ -3660,8 +3684,8 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
{
|
||||
half S = 0.0f;
|
||||
half M = -__FLT16_MAX__/2;
|
||||
float S = 0.0f;
|
||||
float M = -__FLT16_MAX__/2;
|
||||
|
||||
// thread indices inside the simdgroup
|
||||
const short tx = tiisg%NL;
|
||||
@@ -3674,26 +3698,18 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
const short ikv2 = iq2/(args.ne02/args.ne_12_2);
|
||||
const short ikv3 = iq3/(args.ne03/args.ne_12_3);
|
||||
|
||||
// load the queries from shared memory into local memory
|
||||
q4x4_t mq[D16/NL];
|
||||
|
||||
#pragma unroll(D16/NL)
|
||||
for (short ii = 0; ii < D16; ii += NL) {
|
||||
mq[ii/NL] = sq4x4[ii + tx];
|
||||
}
|
||||
|
||||
const bool has_mask = mask != q;
|
||||
|
||||
// pointer to the mask
|
||||
device const half * pm = (device const half *) (mask + iq1*args.nb31);
|
||||
|
||||
half slope = 1.0f;
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (args.max_bias > 0.0f) {
|
||||
const short h = iq2;
|
||||
|
||||
const half base = h < args.n_head_log2 ? args.m0 : args.m1;
|
||||
const float base = h < args.n_head_log2 ? args.m0 : args.m1;
|
||||
const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
|
||||
|
||||
slope = pow(base, exph);
|
||||
@@ -3713,43 +3729,56 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
|
||||
// Q*K^T
|
||||
{
|
||||
// each simdgroup processes 1 query and 4 (NW/NL) keys
|
||||
for (short cc = 0; cc < C/4; ++cc) {
|
||||
qk_t mqka[4] = { 0.0, 0.0, 0.0, 0.0 };
|
||||
// each simdgroup processes 1 query and NE (NW/NL) head elements
|
||||
for (short cc = 0; cc < C/NE; ++cc) {
|
||||
qk_t mqk = 0.0f;
|
||||
|
||||
device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
|
||||
device const kd4_t * pk = (device const kd4_t *) ((device const char *) k + ((ic + NE*cc + ty)*args.nb11 + ikv2*args.nb12 + ikv3*args.nb13));
|
||||
|
||||
#pragma unroll(D16/NL)
|
||||
for (short ii = 0; ii < D16; ii += NL) {
|
||||
#pragma unroll(DK4/NL)
|
||||
for (short ii = 0; ii < DK4; ii += NL) {
|
||||
const short i = ii + tx;
|
||||
|
||||
k4x4_t mk;
|
||||
deq_k(pk + i/nl_k, i%nl_k, mk);
|
||||
k4_t mk;
|
||||
deq_k_t4(pk + i/nl_k, i%nl_k, mk);
|
||||
|
||||
// note: this is less precise than the version below
|
||||
//mqka[0] += dot(mq[ii/NL][0], mk[0]);
|
||||
//mqka[1] += dot(mq[ii/NL][1], mk[1]);
|
||||
//mqka[2] += dot(mq[ii/NL][2], mk[2]);
|
||||
//mqka[3] += dot(mq[ii/NL][3], mk[3]);
|
||||
//mqka[0] += dot(mq[0], mk[0]);
|
||||
//mqka[1] += dot(mq[1], mk[1]);
|
||||
//mqka[2] += dot(mq[2], mk[2]);
|
||||
//mqka[3] += dot(mq[3], mk[3]);
|
||||
|
||||
mqka[0] += dot((float4) mq[ii/NL][0], (float4) mk[0]);
|
||||
mqka[1] += dot((float4) mq[ii/NL][1], (float4) mk[1]);
|
||||
mqka[2] += dot((float4) mq[ii/NL][2], (float4) mk[2]);
|
||||
mqka[3] += dot((float4) mq[ii/NL][3], (float4) mk[3]);
|
||||
//q4x4_t mq = sq4x4[i];
|
||||
//mqka[0] += dot((float4) mq[0], (float4) mk[0]);
|
||||
//mqka[1] += dot((float4) mq[1], (float4) mk[1]);
|
||||
//mqka[2] += dot((float4) mq[2], (float4) mk[2]);
|
||||
//mqka[3] += dot((float4) mq[3], (float4) mk[3]);
|
||||
|
||||
mqk += dot((float4) mk, (float4) sq4[i]);
|
||||
}
|
||||
|
||||
qk_t mqk = mqka[0] + mqka[1] + mqka[2] + mqka[3];
|
||||
static_assert(NE > 1, "NE must be > 1"); // note: not sure why NE == 1 fails
|
||||
|
||||
// simdgroup reduce
|
||||
// simdgroup reduce (NE = 4)
|
||||
// [ 0 .. 7] -> [ 0]
|
||||
// [ 8 .. 15] -> [ 8]
|
||||
// [16 .. 23] -> [16]
|
||||
// [24 .. 31] -> [24]
|
||||
//mqk += simd_shuffle_down(mqk, 16);
|
||||
//mqk += simd_shuffle_down(mqk, 8);
|
||||
mqk += simd_shuffle_down(mqk, 4);
|
||||
mqk += simd_shuffle_down(mqk, 2);
|
||||
mqk += simd_shuffle_down(mqk, 1);
|
||||
if (NE <= 1) {
|
||||
mqk += simd_shuffle_down(mqk, 16);
|
||||
}
|
||||
if (NE <= 2) {
|
||||
mqk += simd_shuffle_down(mqk, 8);
|
||||
}
|
||||
if (NE <= 4) {
|
||||
mqk += simd_shuffle_down(mqk, 4);
|
||||
}
|
||||
if (NE <= 8) {
|
||||
mqk += simd_shuffle_down(mqk, 2);
|
||||
}
|
||||
if (NE <= 16) {
|
||||
mqk += simd_shuffle_down(mqk, 1);
|
||||
}
|
||||
|
||||
// mqk = mqk*scale + mask*slope
|
||||
if (tx == 0) {
|
||||
@@ -3759,9 +3788,9 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
mqk = args.logit_softcap*precise::tanh(mqk);
|
||||
}
|
||||
|
||||
mqk += sm[4*cc + ty]*slope;
|
||||
mqk += sm[NE*cc + ty]*slope;
|
||||
|
||||
ss[4*cc + ty] = mqk;
|
||||
ss[NE*cc + ty] = mqk;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3770,13 +3799,13 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
|
||||
// online softmax
|
||||
{
|
||||
const half m = M;
|
||||
const half s = ss[tiisg];
|
||||
const float m = M;
|
||||
const float s = ss[tiisg];
|
||||
|
||||
M = simd_max(max(M, s));
|
||||
|
||||
const half ms = exp(m - M);
|
||||
const half vs = exp(s - M);
|
||||
const float ms = exp(m - M);
|
||||
const float vs = exp(s - M);
|
||||
|
||||
S = S*ms + simd_sum(vs);
|
||||
|
||||
@@ -3784,8 +3813,8 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
ss[tiisg] = vs;
|
||||
|
||||
// O = diag(ms)*O
|
||||
#pragma unroll(D16/NL)
|
||||
for (short ii = 0; ii < D16; ii += NL) {
|
||||
#pragma unroll(DV4/NL)
|
||||
for (short ii = 0; ii < DV4; ii += NL) {
|
||||
lo[ii/NL] *= ms;
|
||||
}
|
||||
}
|
||||
@@ -3794,19 +3823,20 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
|
||||
// O = O + (Q*K^T)*V
|
||||
{
|
||||
for (short cc = 0; cc < C/4; ++cc) {
|
||||
device const vd4x4_t * pv4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 4*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
|
||||
//#pragma unroll(C/NE)
|
||||
for (short cc = 0; cc < C/NE; ++cc) {
|
||||
device const vd4_t * pv4 = (device const vd4_t *) ((device const char *) v + ((ic + NE*cc + ty)*args.nb21 + ikv2*args.nb22 + ikv3*args.nb23));
|
||||
|
||||
const s4x4_t ms(ss[4*cc + ty]);
|
||||
const s4_t ms(ss[NE*cc + ty]);
|
||||
|
||||
#pragma unroll(D16/NL)
|
||||
for (short ii = 0; ii < D16; ii += NL) {
|
||||
#pragma unroll(DV4/NL)
|
||||
for (short ii = 0; ii < DV4; ii += NL) {
|
||||
const short i = ii + tx;
|
||||
|
||||
v4x4_t mv;
|
||||
deq_v(pv4 + i/nl_v, i%nl_v, mv);
|
||||
v4_t mv;
|
||||
deq_v_t4(pv4 + i/nl_v, i%nl_v, mv);
|
||||
|
||||
lo[ii/NL] += mv*ms;
|
||||
lo[ii/NL] += o4_t(float4(mv)*float4(ms));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3819,7 +3849,7 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
}
|
||||
}
|
||||
|
||||
// simdgroup reduce
|
||||
// simdgroup reduce (NE = 4)
|
||||
// [ 0, 8, 16, 24] -> [ 0]
|
||||
// [ 1, 9, 17, 25] -> [ 1]
|
||||
// [ 2, 10, 18, 26] -> [ 2]
|
||||
@@ -3828,37 +3858,48 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
// [ 5, 13, 21, 29] -> [ 5]
|
||||
// [ 6, 14, 22, 30] -> [ 6]
|
||||
// [ 7, 15, 23, 31] -> [ 7]
|
||||
for (short ii = 0; ii < D16; ii += NL) {
|
||||
lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 16);
|
||||
lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 8);
|
||||
//lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 4);
|
||||
//lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 2);
|
||||
//lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 1);
|
||||
for (short ii = 0; ii < DV4; ii += NL) {
|
||||
if (NE > 1) {
|
||||
lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 16);
|
||||
lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 16);
|
||||
lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 16);
|
||||
lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 16);
|
||||
}
|
||||
|
||||
lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 16);
|
||||
lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 8);
|
||||
//lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 4);
|
||||
//lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 2);
|
||||
//lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 1);
|
||||
if (NE > 2) {
|
||||
lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 8);
|
||||
lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 8);
|
||||
lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 8);
|
||||
lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 8);
|
||||
}
|
||||
|
||||
lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 16);
|
||||
lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 8);
|
||||
//lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 4);
|
||||
//lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 2);
|
||||
//lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 1);
|
||||
if (NE > 4) {
|
||||
lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 4);
|
||||
lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 4);
|
||||
lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 4);
|
||||
lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 4);
|
||||
}
|
||||
|
||||
lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 16);
|
||||
lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 8);
|
||||
//lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 4);
|
||||
//lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 2);
|
||||
//lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 1);
|
||||
if (NE > 8) {
|
||||
lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 2);
|
||||
lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 2);
|
||||
lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 2);
|
||||
lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 2);
|
||||
}
|
||||
|
||||
if (NE > 16) {
|
||||
lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 1);
|
||||
lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 1);
|
||||
lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 1);
|
||||
lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 1);
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// store results to shared memory
|
||||
for (short i = tiisg; i < D16; i += NL) {
|
||||
sr4x4[i] = lo[i/NL];
|
||||
for (short i = tiisg; i < DV4; i += NL) {
|
||||
sr4[i] = lo[i/NL];
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
@@ -3866,18 +3907,18 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
// parallel reduce
|
||||
for (short r = nsg/2; r > 0; r >>= 1) {
|
||||
if (sgitg < r) {
|
||||
const half S0 = ss[ 0];
|
||||
const half S1 = ss[r*SH + 0];
|
||||
const float S0 = ss[ 0];
|
||||
const float S1 = ss[r*(SH/2) + 0];
|
||||
|
||||
const half M0 = ss[ 1];
|
||||
const half M1 = ss[r*SH + 1];
|
||||
const float M0 = ss[ 1];
|
||||
const float M1 = ss[r*(SH/2) + 1];
|
||||
|
||||
const half M = max(M0, M1);
|
||||
const float M = max(M0, M1);
|
||||
|
||||
const half ms0 = exp(M0 - M);
|
||||
const half ms1 = exp(M1 - M);
|
||||
const float ms0 = exp(M0 - M);
|
||||
const float ms1 = exp(M1 - M);
|
||||
|
||||
const half S = S0*ms0 + S1*ms1;
|
||||
const float S = S0*ms0 + S1*ms1;
|
||||
|
||||
if (tiisg == 0) {
|
||||
ss[0] = S;
|
||||
@@ -3885,22 +3926,22 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
}
|
||||
|
||||
// O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
|
||||
for (short i = tiisg; i < D16; i += NW) {
|
||||
sr4x4[i] = sr4x4[i]*ms0 + sr4x4[i + r*D16]*ms1;
|
||||
for (short i = tiisg; i < DV4; i += NW) {
|
||||
sr4[i] = sr4[i]*ms0 + sr4[i + r*DV4]*ms1;
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
device float4x4 * dst44 = (device float4x4 *) dst;
|
||||
device float4 * dst4 = (device float4 *) dst;
|
||||
|
||||
// final rescale with 1/S and store to global memory
|
||||
if (sgitg == 0) {
|
||||
const float S = ss[0];
|
||||
|
||||
for (short i = tiisg; i < D16; i += NW) {
|
||||
dst44[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)iq1*args.ne1)*D16 + i] = (float4x4) sr4x4[i]/S;
|
||||
for (short i = tiisg; i < DV4; i += NW) {
|
||||
dst4[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)iq1*args.ne1)*DV4 + i] = (float4) sr4[i]/S;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3909,34 +3950,54 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
// in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max
|
||||
//
|
||||
#define FA_TYPES \
|
||||
half4, half4x4, \
|
||||
half4x4, \
|
||||
half4x4, \
|
||||
float, \
|
||||
half, half4, half4x4, \
|
||||
half4x4
|
||||
half4, \
|
||||
half4, \
|
||||
half4, \
|
||||
float, \
|
||||
float, float4, \
|
||||
half4
|
||||
|
||||
typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 128>) flash_attn_ext_vec_t;
|
||||
typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 128, 128, 4>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 128, 128, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 128, 128, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 128, 128, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 128, 128, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 128, 128, 4>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 192, 192, 4>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 192, 192, 4>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_h192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 192, 192, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_h192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 192, 192, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_h192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 192, 192, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 192, 192, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 192, 192, 4>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_hk192_hv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 192, 128, 4>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_hk192_hv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 192, 128, 4>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_hk192_hv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 192, 128, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_hk192_hv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 192, 128, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_hk192_hv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 192, 128, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_hk192_hv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 192, 128, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_hk192_hv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 192, 128, 4>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 256, 256, 4>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 256, 256, 4>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 256, 256, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 256, 256, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 256, 256, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 256, 256, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 256, 256, 4>;
|
||||
|
||||
#undef FA_TYPES
|
||||
|
||||
|
||||
@@ -63,6 +63,7 @@ set(GGML_OPENCL_KERNELS
|
||||
ggml-opencl_transpose_16
|
||||
ggml-opencl_transpose_32
|
||||
ggml-opencl_transpose_32_16
|
||||
ggml-opencl_im2col
|
||||
)
|
||||
|
||||
foreach (K ${GGML_OPENCL_KERNELS})
|
||||
|
||||
@@ -224,12 +224,14 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program;
|
||||
cl_program program_1;
|
||||
cl_program program_2;
|
||||
cl_program program_im2col;
|
||||
|
||||
cl_kernel kernel_add, kernel_add_row;
|
||||
cl_kernel kernel_mul, kernel_mul_row;
|
||||
cl_kernel kernel_scale;
|
||||
cl_kernel kernel_silu, kernel_silu_4;
|
||||
cl_kernel kernel_gelu, kernel_gelu_4;
|
||||
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
|
||||
cl_kernel kernel_relu;
|
||||
cl_kernel kernel_clamp;
|
||||
cl_kernel kernel_norm;
|
||||
@@ -239,6 +241,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
|
||||
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
|
||||
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
||||
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
||||
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
|
||||
cl_kernel kernel_mul_mat_f32_f32;
|
||||
cl_kernel kernel_mul_mat_f16_f16;
|
||||
@@ -252,6 +255,7 @@ struct ggml_backend_opencl_context {
|
||||
kernel_mul_mat_q4_0_f32_flat_img_v0;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
||||
cl_kernel kernel_mul_mv_q6_K_f32;
|
||||
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Transpose kernels
|
||||
@@ -708,6 +712,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program, "kernel_gelu_quick", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_quick_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err));
|
||||
@@ -722,6 +728,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_rope_multi_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_multi_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_rope_multi_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_multi_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_rope_vision_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_vision_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_rope_vision_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_vision_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err));
|
||||
@@ -769,6 +779,19 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err));
|
||||
|
||||
// im2col kernels
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src_im2col {
|
||||
#include "ggml-opencl_im2col.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src_im2col = read_file("ggml-opencl_im2col.cl");
|
||||
#endif
|
||||
backend_ctx->program_im2col = build_program_from_source(context, device, kernel_src_im2col.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col, "kernel_im2col_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col, "kernel_im2col_f16", &err), err));
|
||||
|
||||
// Kernels for Adreno
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -898,10 +921,30 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
|
||||
|
||||
// TODO: fixme: these sizes are hardcoded for now.
|
||||
// they should be allocated based on the model's size
|
||||
// and the device's max alloc size
|
||||
// Allocate intermediate buffers and images
|
||||
size_t max_A_q_d_bytes = 311164928;
|
||||
size_t max_A_s_d_bytes = 38895616;
|
||||
size_t max_B_d_bytes = 45088768;
|
||||
size_t required_A_q_d_bytes = 311164928;
|
||||
size_t required_A_s_d_bytes = 38895616;
|
||||
size_t required_B_d_bytes = 45088768;
|
||||
|
||||
// Ensure buffer sizes do not exceed the maximum allocation size
|
||||
size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, backend_ctx->max_alloc_size);
|
||||
size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, backend_ctx->max_alloc_size);
|
||||
size_t max_B_d_bytes = MIN(required_B_d_bytes, backend_ctx->max_alloc_size);
|
||||
if (required_A_q_d_bytes > backend_ctx->max_alloc_size) {
|
||||
GGML_LOG_WARN("ggml_opencl: A_q_d buffer size reduced from %zu to %zu due to device limitations.\n",
|
||||
required_A_q_d_bytes, max_A_q_d_bytes);
|
||||
}
|
||||
if (required_A_s_d_bytes > backend_ctx->max_alloc_size) {
|
||||
GGML_LOG_WARN("ggml_opencl: A_s_d buffer size reduced from %zu to %zu due to device limitations.\n",
|
||||
required_A_s_d_bytes, max_A_s_d_bytes);
|
||||
}
|
||||
if (required_B_d_bytes > backend_ctx->max_alloc_size) {
|
||||
GGML_LOG_WARN("ggml_opencl: B_d buffer size reduced from %zu to %zu due to device limitations.\n",
|
||||
required_B_d_bytes, max_B_d_bytes);
|
||||
}
|
||||
|
||||
CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
|
||||
CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
|
||||
@@ -1187,6 +1230,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
@@ -1216,14 +1260,26 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
return op->ne[3] == 1;
|
||||
case GGML_OP_ROPE: {
|
||||
const int mode = ((const int32_t *) op->op_params)[2];
|
||||
if (mode & GGML_ROPE_TYPE_MROPE) {
|
||||
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
|
||||
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
||||
if (is_mrope && !is_vision) {
|
||||
if (op->src[0]->type == GGML_TYPE_F32 ||
|
||||
op->src[0]->type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (mode & GGML_ROPE_TYPE_VISION) {
|
||||
if (is_vision) {
|
||||
if (op->src[0]->type == GGML_TYPE_F32 ||
|
||||
op->src[0]->type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -2582,6 +2638,53 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
UNUSED(src1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
int n = ggml_nelements(dst);
|
||||
|
||||
if (n % 4 == 0) {
|
||||
kernel = backend_ctx->kernel_gelu_quick_4;
|
||||
n /= 4;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_gelu_quick;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -3980,6 +4083,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
int32_t sections[4];
|
||||
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
@@ -3987,23 +4091,23 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||
memcpy(§ions, (int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
|
||||
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
||||
|
||||
if (is_mrope) {
|
||||
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
|
||||
}
|
||||
|
||||
if (is_vision) {
|
||||
GGML_ASSERT(n_dims == ne00/2);
|
||||
}
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
if (!is_neox) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
kernel = backend_ctx->kernel_rope_norm_f32;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
kernel = backend_ctx->kernel_rope_norm_f16;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
};
|
||||
} else {
|
||||
if (is_neox) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
kernel = backend_ctx->kernel_rope_neox_f32;
|
||||
@@ -4014,6 +4118,39 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
};
|
||||
} else if (is_mrope && !is_vision) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
kernel = backend_ctx->kernel_rope_multi_f32;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
kernel = backend_ctx->kernel_rope_multi_f16;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
};
|
||||
} else if (is_vision) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
kernel = backend_ctx->kernel_rope_vision_f32;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
kernel = backend_ctx->kernel_rope_vision_f16;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} else {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
kernel = backend_ctx->kernel_rope_norm_f32;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
kernel = backend_ctx->kernel_rope_norm_f16;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
};
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
@@ -4049,6 +4186,9 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor));
|
||||
CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast));
|
||||
CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow));
|
||||
if (is_mrope || is_vision) {
|
||||
CL_CHECK(clSetKernelArg(kernel, 33, sizeof(int32_t)*4, §ions));
|
||||
}
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
@@ -4064,6 +4204,98 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
// src0 - filter, src1 - input
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
||||
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
|
||||
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
|
||||
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
|
||||
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
|
||||
|
||||
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
|
||||
|
||||
const cl_long IC = src1->ne[is_2D ? 2 : 1];
|
||||
const cl_long IH = is_2D ? src1->ne[1] : 1;
|
||||
const cl_long IW = src1->ne[0];
|
||||
|
||||
const cl_long KH = is_2D ? src0->ne[1] : 1;
|
||||
const cl_long KW = src0->ne[0];
|
||||
|
||||
const cl_long OH = is_2D ? dst->ne[2] : 1;
|
||||
const cl_long OW = dst->ne[1];
|
||||
|
||||
// nb is byte offset, src is type float32
|
||||
const cl_ulong delta_offset = src1->nb[is_2D ? 2 : 1]/4;
|
||||
const cl_long batch = src1->ne[is_2D ? 3 : 2];
|
||||
const cl_ulong batch_offset = src1->nb[is_2D ? 3 : 2]/4;
|
||||
|
||||
const cl_long pelements = OW*KW*KH;
|
||||
const cl_long CHW = IC*KH*KW;
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
if(dst->type == GGML_TYPE_F16) {
|
||||
kernel = backend_ctx->kernel_im2col_f16;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_im2col_f32;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &batch_offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &delta_offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long), &IW));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long), &IH));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long), &IC));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long), &OW));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_long), &OH));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_long), &KW));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_long), &KH));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_long), &pelements));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_long), &CHW));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &s0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &s1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &p0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &p1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &d0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &d1));
|
||||
|
||||
const int num_blocks = (pelements + 256 - 1) / 256;
|
||||
size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
|
||||
size_t local_work_size[] = {256, 1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Op offloading
|
||||
//------------------------------------------------------------------------------
|
||||
@@ -4122,6 +4354,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_gelu;
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_gelu_quick;
|
||||
break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
@@ -4194,6 +4432,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_rope;
|
||||
break;
|
||||
case GGML_OP_IM2COL:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_im2col;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -404,6 +404,7 @@ kernel void kernel_scale(
|
||||
// gelu
|
||||
//------------------------------------------------------------------------------
|
||||
#define GELU_COEF_A 0.044715f
|
||||
#define GELU_QUICK_COEF -1.702f
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876f
|
||||
|
||||
kernel void kernel_gelu(
|
||||
@@ -434,6 +435,32 @@ kernel void kernel_gelu_4(
|
||||
dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_quick(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
float x = src0[get_global_id(0)];
|
||||
dst[get_global_id(0)] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_quick_4(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
float4 x = src0[get_global_id(0)];
|
||||
dst[get_global_id(0)] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// silu
|
||||
//------------------------------------------------------------------------------
|
||||
@@ -1325,6 +1352,368 @@ kernel void kernel_rope_neox_f16(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_rope_multi_f32(
|
||||
global void * src0,
|
||||
ulong offset0,
|
||||
global int * src1,
|
||||
ulong offset1,
|
||||
global float * src2,
|
||||
ulong offset2,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int n_ctx_orig,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow,
|
||||
int4 sections
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
src1 = (global int*)((global char*)src1 + offset1);
|
||||
src2 = (global float*)((global char*)src2 + offset2);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i3 = get_group_id(2);
|
||||
int i2 = get_group_id(1);
|
||||
int i1 = get_group_id(0);
|
||||
|
||||
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
||||
|
||||
global int * pos = src1;
|
||||
|
||||
const int sect_dims = sections.s0 + sections.s1 + sections.s2 + sections.s3;
|
||||
const int sec_w = sections.s1 + sections.s0;
|
||||
|
||||
float inv_ndims = -1.f/n_dims;
|
||||
|
||||
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
||||
if (i0 < n_dims) {
|
||||
int ic = i0/2;
|
||||
|
||||
const int sector = (i0 / 2) % sect_dims;
|
||||
float theta_base = 0.0f;
|
||||
|
||||
if (sector < sections.s0) {
|
||||
theta_base = pos[i2];
|
||||
}
|
||||
else if (sector >= sections.s0 && sector < sec_w) {
|
||||
theta_base = pos[i2 + ne2 * 1];
|
||||
}
|
||||
else if (sector >= sec_w && sector < sec_w + sections.s2) {
|
||||
theta_base = pos[i2 + ne2 * 2];
|
||||
}
|
||||
else if (sector >= sec_w + sections.s2) {
|
||||
theta_base = pos[i2 + ne2 * 3];
|
||||
}
|
||||
|
||||
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
||||
|
||||
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
||||
|
||||
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
||||
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[n_dims/2];
|
||||
|
||||
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
||||
dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
||||
} else {
|
||||
global float * const src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_rope_multi_f16(
|
||||
global void * src0,
|
||||
ulong offset0,
|
||||
global int * src1,
|
||||
ulong offset1,
|
||||
global float * src2,
|
||||
ulong offset2,
|
||||
global half * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int n_ctx_orig,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow,
|
||||
int4 sections
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
src1 = (global int*)((global char*)src1 + offset1);
|
||||
src2 = (global float*)((global char*)src2 + offset2);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i3 = get_group_id(2);
|
||||
int i2 = get_group_id(1);
|
||||
int i1 = get_group_id(0);
|
||||
|
||||
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
||||
|
||||
global int * pos = src1;
|
||||
|
||||
const int sect_dims = sections.s0 + sections.s1 + sections.s2 + sections.s3;
|
||||
const int sec_w = sections.s1 + sections.s0;
|
||||
|
||||
float inv_ndims = -1.f/n_dims;
|
||||
|
||||
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
||||
if (i0 < n_dims) {
|
||||
int ic = i0/2;
|
||||
|
||||
const int sector = (i0 / 2) % sect_dims;
|
||||
float theta_base = 0.0f;
|
||||
|
||||
if (sector < sections.s0) {
|
||||
theta_base = pos[i2];
|
||||
}
|
||||
else if (sector >= sections.s0 && sector < sec_w) {
|
||||
theta_base = pos[i2 + ne2 * 1];
|
||||
}
|
||||
else if (sector >= sec_w && sector < sec_w + sections.s2) {
|
||||
theta_base = pos[i2 + ne2 * 2];
|
||||
}
|
||||
else if (sector >= sec_w + sections.s2) {
|
||||
theta_base = pos[i2 + ne2 * 3];
|
||||
}
|
||||
|
||||
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
||||
|
||||
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
||||
|
||||
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
||||
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[n_dims/2];
|
||||
|
||||
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
||||
dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
||||
} else {
|
||||
global half * const src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_rope_vision_f32(
|
||||
global void * src0,
|
||||
ulong offset0,
|
||||
global int * src1,
|
||||
ulong offset1,
|
||||
global float * src2,
|
||||
ulong offset2,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int n_ctx_orig,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow,
|
||||
int4 sections
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
src1 = (global int*)((global char*)src1 + offset1);
|
||||
src2 = (global float*)((global char*)src2 + offset2);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i3 = get_group_id(2);
|
||||
int i2 = get_group_id(1);
|
||||
int i1 = get_group_id(0);
|
||||
|
||||
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
||||
|
||||
global int * pos = src1;
|
||||
|
||||
const int sect_dims = sections.s0 + sections.s1;
|
||||
const int sec_w = sections.s1 + sections.s0;
|
||||
|
||||
float inv_ndims = -1.f/n_dims;
|
||||
|
||||
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
||||
int ic = i0/2;
|
||||
|
||||
const int sector = (i0/2) % sect_dims;
|
||||
float theta_base = 0.0f;
|
||||
|
||||
if (sector < sections.s0) {
|
||||
const int p = sector;
|
||||
theta_base = pos[i2] * pow(freq_base, inv_ndims*2.0f*p);
|
||||
} else if (sector >= sections.s0 && sector < sec_w) {
|
||||
const int p = sector - sections.s0;
|
||||
theta_base = pos[i2 + ne2] * pow(freq_base, inv_ndims*2.0f*p);
|
||||
}
|
||||
|
||||
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
||||
|
||||
float2 cos_sin_theta = rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
||||
|
||||
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
||||
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[n_dims];
|
||||
|
||||
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
||||
dst_data[n_dims] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_rope_vision_f16(
|
||||
global void * src0,
|
||||
ulong offset0,
|
||||
global int * src1,
|
||||
ulong offset1,
|
||||
global float * src2,
|
||||
ulong offset2,
|
||||
global half * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int n_ctx_orig,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow,
|
||||
int4 sections
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
src1 = (global int*)((global char*)src1 + offset1);
|
||||
src2 = (global float*)((global char*)src2 + offset2);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i3 = get_group_id(2);
|
||||
int i2 = get_group_id(1);
|
||||
int i1 = get_group_id(0);
|
||||
|
||||
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
||||
|
||||
global int * pos = src1;
|
||||
|
||||
const int sect_dims = sections.s0 + sections.s1;
|
||||
const int sec_w = sections.s1 + sections.s0;
|
||||
|
||||
float inv_ndims = -1.f/n_dims;
|
||||
|
||||
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
||||
int ic = i0/2;
|
||||
|
||||
const int sector = (i0/2) % sect_dims;
|
||||
float theta_base = 0.0f;
|
||||
|
||||
if (sector < sections.s0) {
|
||||
const int p = sector;
|
||||
theta_base = pos[i2] * pow(freq_base, inv_ndims*2.0f*p);
|
||||
} else if (sector >= sections.s0 && sector < sec_w) {
|
||||
const int p = sector - sections.s0;
|
||||
theta_base = pos[i2 + ne2] * pow(freq_base, inv_ndims*2.0f*p);
|
||||
}
|
||||
|
||||
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
||||
|
||||
float2 cos_sin_theta = rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
||||
|
||||
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
||||
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[n_dims];
|
||||
|
||||
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
||||
dst_data[n_dims] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// cpy
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
146
ggml/src/ggml-opencl/kernels/ggml-opencl_im2col.cl
Normal file
146
ggml/src/ggml-opencl/kernels/ggml-opencl_im2col.cl
Normal file
@@ -0,0 +1,146 @@
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#elif defined(cl_amd_fp16)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
|
||||
#else
|
||||
#error "Half precision floating point not supportedby OpenCL implementation on your device."
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_subgroups
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#elif defined(cl_intel_subgroups)
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#error "Subgroup not supported on your device."
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
// Always use subgroup size of 32 on Intel.
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
// Always use subgroups size of 64 on Adreno.
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#else
|
||||
// TODO: do not know how to choose subgroup size on other GPUs.
|
||||
#error "Selecting subgroup size is not supported on your device."
|
||||
#endif
|
||||
|
||||
kernel void kernel_im2col_f32(
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
ulong batch_offset,
|
||||
ulong delta_offset,
|
||||
long IW,
|
||||
long IH,
|
||||
long IC,
|
||||
long OW,
|
||||
long OH,
|
||||
long KW,
|
||||
long KH,
|
||||
long pelements,
|
||||
long CHW,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1
|
||||
) {
|
||||
// threadIdx.x + blockIdx.x * blockDim.x
|
||||
long i = get_global_id(0);
|
||||
if (i >= pelements) {
|
||||
return;
|
||||
}
|
||||
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
long ksize = OW * (KH > 1 ? KW : 1);
|
||||
long kx = i / ksize;
|
||||
long kd = kx * ksize;
|
||||
long ky = (i - kd) / OW;
|
||||
long ix = i % OW;
|
||||
|
||||
long oh = get_group_id(1);
|
||||
long batch = get_group_id(2) / IC;
|
||||
long ic = get_group_id(2) % IC;
|
||||
|
||||
long iiw = ix * s0 + kx * d0 - p0;
|
||||
long iih = oh * s1 + ky * d1 - p1;
|
||||
|
||||
long offset_dst =
|
||||
((batch * OH + oh) * OW + ix) * CHW +
|
||||
(ic * (KW * KH) + ky * KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
long offset_src = ic * delta_offset + batch * batch_offset;
|
||||
dst[offset_dst] = src1[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_im2col_f16(
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global half * dst,
|
||||
ulong offsetd,
|
||||
ulong batch_offset,
|
||||
ulong delta_offset,
|
||||
long IW,
|
||||
long IH,
|
||||
long IC,
|
||||
long OW,
|
||||
long OH,
|
||||
long KW,
|
||||
long KH,
|
||||
long pelements,
|
||||
long CHW,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1
|
||||
) {
|
||||
long i = get_global_id(0);
|
||||
|
||||
if (i >= pelements) {
|
||||
return;
|
||||
}
|
||||
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
long ksize = OW * (KH > 1 ? KW : 1);
|
||||
long kx = i / ksize;
|
||||
long kd = kx * ksize;
|
||||
long ky = (i - kd) / OW;
|
||||
long ix = i % OW;
|
||||
|
||||
long oh = get_group_id(1);
|
||||
long batch = get_group_id(2) / IC;
|
||||
long ic = get_group_id(2) % IC;
|
||||
|
||||
long iiw = ix * s0 + kx * d0 - p0;
|
||||
long iih = oh * s1 + ky * d1 - p1;
|
||||
|
||||
long offset_dst =
|
||||
((batch * OH + oh) * OW + ix) * CHW +
|
||||
(ic * (KW * KH) + ky * KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
long offset_src = ic * delta_offset + batch * batch_offset;
|
||||
dst[offset_dst] = src1[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
|
||||
@@ -26,6 +26,10 @@
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <filesystem>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
#ifdef _WIN32
|
||||
typedef SOCKET sockfd_t;
|
||||
@@ -80,6 +84,7 @@ enum rpc_cmd {
|
||||
RPC_CMD_FREE_BUFFER,
|
||||
RPC_CMD_BUFFER_CLEAR,
|
||||
RPC_CMD_SET_TENSOR,
|
||||
RPC_CMD_SET_TENSOR_HASH,
|
||||
RPC_CMD_GET_TENSOR,
|
||||
RPC_CMD_COPY_TENSOR,
|
||||
RPC_CMD_GRAPH_COMPUTE,
|
||||
@@ -89,6 +94,9 @@ enum rpc_cmd {
|
||||
RPC_CMD_COUNT,
|
||||
};
|
||||
|
||||
// Try RPC_CMD_SET_TENSOR_HASH first when data size is larger than this threshold
|
||||
const size_t HASH_THRESHOLD = 10 * 1024 * 1024;
|
||||
|
||||
struct rpc_msg_get_alloc_size_req {
|
||||
rpc_tensor tensor;
|
||||
};
|
||||
@@ -135,6 +143,10 @@ struct rpc_msg_buffer_clear_req {
|
||||
uint8_t value;
|
||||
};
|
||||
|
||||
struct rpc_msg_set_tensor_hash_rsp {
|
||||
uint8_t result;
|
||||
};
|
||||
|
||||
struct rpc_msg_get_tensor_req {
|
||||
rpc_tensor tensor;
|
||||
uint64_t offset;
|
||||
@@ -187,6 +199,18 @@ struct ggml_backend_rpc_buffer_context {
|
||||
|
||||
// RPC helper functions
|
||||
|
||||
// Computes FNV-1a hash of the data
|
||||
static uint64_t fnv_hash(const uint8_t * data, size_t len) {
|
||||
const uint64_t fnv_prime = 0x100000001b3ULL;
|
||||
uint64_t hash = 0xcbf29ce484222325ULL;
|
||||
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
hash ^= data[i];
|
||||
hash *= fnv_prime;
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
|
||||
#ifdef _WIN32
|
||||
if (fd == INVALID_SOCKET) {
|
||||
@@ -483,10 +507,26 @@ static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_
|
||||
|
||||
static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
// input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
|
||||
rpc_tensor rpc_tensor = serialize_tensor(tensor);
|
||||
if (size > HASH_THRESHOLD) {
|
||||
// input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes)
|
||||
size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
|
||||
std::vector<uint8_t> input(input_size, 0);
|
||||
uint64_t hash = fnv_hash((const uint8_t*)data, size);
|
||||
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
||||
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
||||
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
|
||||
rpc_msg_set_tensor_hash_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
if (response.result) {
|
||||
// the server has the same data, no need to send it
|
||||
return;
|
||||
}
|
||||
}
|
||||
// input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes)
|
||||
size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size;
|
||||
std::vector<uint8_t> input(input_size, 0);
|
||||
rpc_tensor rpc_tensor = serialize_tensor(tensor);
|
||||
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
||||
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
||||
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
||||
@@ -772,7 +812,9 @@ void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, si
|
||||
|
||||
class rpc_server {
|
||||
public:
|
||||
rpc_server(ggml_backend_t backend) : backend(backend) {}
|
||||
rpc_server(ggml_backend_t backend, const char * cache_dir)
|
||||
: backend(backend), cache_dir(cache_dir) {
|
||||
}
|
||||
~rpc_server();
|
||||
|
||||
void alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response);
|
||||
@@ -782,6 +824,7 @@ public:
|
||||
bool free_buffer(const rpc_msg_free_buffer_req & request);
|
||||
bool buffer_clear(const rpc_msg_buffer_clear_req & request);
|
||||
bool set_tensor(const std::vector<uint8_t> & input);
|
||||
bool set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response);
|
||||
bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
|
||||
bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
|
||||
bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
|
||||
@@ -789,6 +832,7 @@ public:
|
||||
bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response);
|
||||
|
||||
private:
|
||||
bool get_cached_file(uint64_t hash, std::vector<uint8_t> & data);
|
||||
ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
|
||||
ggml_tensor * create_node(uint64_t id,
|
||||
struct ggml_context * ctx,
|
||||
@@ -797,6 +841,7 @@ private:
|
||||
|
||||
|
||||
ggml_backend_t backend;
|
||||
const char * cache_dir;
|
||||
std::unordered_set<ggml_backend_buffer_t> buffers;
|
||||
};
|
||||
|
||||
@@ -960,11 +1005,85 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
}
|
||||
|
||||
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
|
||||
if (cache_dir && size > HASH_THRESHOLD) {
|
||||
uint64_t hash = fnv_hash((const uint8_t*)data, size);
|
||||
char hash_str[17];
|
||||
snprintf(hash_str, sizeof(hash_str), "%016" PRIx64, hash);
|
||||
// save to cache_dir/hash_str
|
||||
fs::path cache_file = fs::path(cache_dir) / hash_str;
|
||||
std::ofstream ofs(cache_file, std::ios::binary);
|
||||
ofs.write((const char *)data, size);
|
||||
printf("[%s] saved to '%s'\n", __func__, cache_file.c_str());
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, data, offset, size);
|
||||
ggml_free(ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
|
||||
if (!cache_dir) {
|
||||
return false;
|
||||
}
|
||||
char hash_str[17];
|
||||
snprintf(hash_str, sizeof(hash_str), "%016" PRIx64, hash);
|
||||
fs::path cache_file = fs::path(cache_dir) / hash_str;
|
||||
if (!fs::exists(cache_file)) {
|
||||
return false;
|
||||
}
|
||||
std::ifstream ifs(cache_file, std::ios::binary);
|
||||
ifs.seekg(0, std::ios::end);
|
||||
size_t size = ifs.tellg();
|
||||
ifs.seekg(0, std::ios::beg);
|
||||
data.resize(size);
|
||||
ifs.read((char *)data.data(), size);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response)
|
||||
{
|
||||
// serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
|
||||
if (input.size() != sizeof(rpc_tensor) + 16) {
|
||||
return false;
|
||||
}
|
||||
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
||||
uint64_t offset;
|
||||
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
||||
const uint64_t * hash = (const uint64_t *)(input.data() + sizeof(rpc_tensor) + sizeof(offset));
|
||||
std::vector<uint8_t> cached_file;
|
||||
if (!get_cached_file(*hash, cached_file)) {
|
||||
response.result = 0;
|
||||
return true;
|
||||
}
|
||||
size_t size = cached_file.size();
|
||||
struct ggml_init_params params {
|
||||
/*.mem_size =*/ ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
||||
if (tensor == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
ggml_free(ctx);
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size, *hash);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
|
||||
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||
|
||||
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, cached_file.data(), offset, size);
|
||||
response.result = 1;
|
||||
ggml_free(ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
|
||||
struct ggml_init_params params {
|
||||
/*.mem_size =*/ ggml_tensor_overhead(),
|
||||
@@ -1148,8 +1267,9 @@ rpc_server::~rpc_server() {
|
||||
}
|
||||
}
|
||||
|
||||
static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) {
|
||||
rpc_server server(backend);
|
||||
static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
sockfd_t sockfd, size_t free_mem, size_t total_mem) {
|
||||
rpc_server server(backend, cache_dir);
|
||||
while (true) {
|
||||
uint8_t cmd;
|
||||
if (!recv_data(sockfd, &cmd, 1)) {
|
||||
@@ -1260,6 +1380,20 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RPC_CMD_SET_TENSOR_HASH: {
|
||||
std::vector<uint8_t> input;
|
||||
if (!recv_msg(sockfd, input)) {
|
||||
return;
|
||||
}
|
||||
rpc_msg_set_tensor_hash_rsp response;
|
||||
if (!server.set_tensor_hash(input, response)) {
|
||||
return;
|
||||
}
|
||||
if (!send_msg(sockfd, &response, sizeof(response))) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RPC_CMD_INIT_TENSOR: {
|
||||
rpc_msg_init_tensor_req request;
|
||||
if (!recv_msg(sockfd, &request,sizeof(request))) {
|
||||
@@ -1335,7 +1469,9 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
|
||||
void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
|
||||
const char * cache_dir,
|
||||
size_t free_mem, size_t total_mem) {
|
||||
std::string host;
|
||||
int port;
|
||||
if (!parse_endpoint(endpoint, host, port)) {
|
||||
@@ -1364,7 +1500,7 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
|
||||
}
|
||||
printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
|
||||
fflush(stdout);
|
||||
rpc_serve_client(backend, client_socket->fd, free_mem, total_mem);
|
||||
rpc_serve_client(backend, cache_dir, client_socket->fd, free_mem, total_mem);
|
||||
printf("Client connection closed\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
@@ -23,6 +23,23 @@ ggml_add_backend_library(ggml-sycl
|
||||
../../include/ggml-sycl.h
|
||||
)
|
||||
|
||||
file(GLOB GGML_HEADERS_SYCL "*.hpp")
|
||||
file(GLOB GGML_SOURCES_SYCL "*.cpp")
|
||||
target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
|
||||
|
||||
find_package(IntelSYCL)
|
||||
if (IntelSYCL_FOUND)
|
||||
# Use oneAPI CMake when possible
|
||||
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX)
|
||||
else()
|
||||
# Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance
|
||||
target_compile_options(ggml-sycl PRIVATE "-fsycl")
|
||||
target_link_options(ggml-sycl PRIVATE "-fsycl")
|
||||
endif()
|
||||
|
||||
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
|
||||
|
||||
# Link against oneDNN
|
||||
find_package(DNNL)
|
||||
set(GGML_SYCL_DNNL 0)
|
||||
if(DNNL_FOUND)
|
||||
@@ -62,8 +79,6 @@ if (GGML_SYCL_F16)
|
||||
add_compile_definitions(GGML_SYCL_F16)
|
||||
endif()
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
|
||||
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
@@ -76,34 +91,84 @@ else()
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
||||
endif()
|
||||
|
||||
file(GLOB GGML_HEADERS_SYCL "*.hpp")
|
||||
file(GLOB GGML_SOURCES_SYCL "*.cpp")
|
||||
target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
|
||||
if (GGML_SYCL_GRAPH)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
|
||||
endif()
|
||||
|
||||
|
||||
if (WIN32)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
# Link against Intel oneMKL or oneMath
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
# Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
|
||||
# See https://github.com/uxlfoundation/oneMath/issues/654
|
||||
find_package(MKL REQUIRED)
|
||||
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
|
||||
else()
|
||||
if (GGML_SYCL_GRAPH)
|
||||
add_compile_definitions(GGML_SYCL_GRAPH)
|
||||
find_package(oneMath QUIET)
|
||||
if (NOT oneMath_FOUND)
|
||||
message(STATUS "oneMath not found: oneMath will be automatically downloaded")
|
||||
# Use FetchContent to automatically pull and build oneMath
|
||||
include(FetchContent)
|
||||
set(BUILD_FUNCTIONAL_TESTS False)
|
||||
set(BUILD_EXAMPLES False)
|
||||
set(TARGET_DOMAINS blas)
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(ENABLE_MKLCPU_BACKEND False)
|
||||
set(ENABLE_MKLGPU_BACKEND False)
|
||||
set(ENABLE_CUBLAS_BACKEND True)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
set(ENABLE_MKLCPU_BACKEND False)
|
||||
set(ENABLE_MKLGPU_BACKEND False)
|
||||
set(ENABLE_ROCBLAS_BACKEND True)
|
||||
# Ensure setting a string variable here is not overriden by oneMath CACHE variables
|
||||
cmake_policy(SET CMP0126 NEW)
|
||||
# Setting the device architecture is only needed and useful for AMD devices in oneMath
|
||||
set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
|
||||
endif()
|
||||
FetchContent_Declare(
|
||||
ONEMATH
|
||||
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
|
||||
GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a
|
||||
)
|
||||
FetchContent_MakeAvailable(ONEMATH)
|
||||
# Create alias to match with find_package targets name
|
||||
function(onemath_alias target)
|
||||
if (TARGET ${target}_obj)
|
||||
# Silence verbose warnings from external libraries
|
||||
target_compile_options(${target}_obj PRIVATE -w)
|
||||
endif()
|
||||
if (TARGET ${target})
|
||||
add_library(ONEMATH::${target} ALIAS ${target})
|
||||
endif()
|
||||
endfunction()
|
||||
onemath_alias(onemath)
|
||||
onemath_alias(onemath_blas_mklcpu)
|
||||
onemath_alias(onemath_blas_mklgpu)
|
||||
onemath_alias(onemath_blas_cublas)
|
||||
onemath_alias(onemath_blas_rocblas)
|
||||
endif()
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||
add_compile_definitions(GGML_SYCL_NVIDIA)
|
||||
target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas)
|
||||
|
||||
# Below oneMath compile-time dispatching is used for better performance
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas)
|
||||
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
|
||||
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
if (NOT GGML_SYCL_DEVICE_ARCH)
|
||||
message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
|
||||
endif()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl)
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
|
||||
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
|
||||
else()
|
||||
# Fallback to oneMath runtime dispatcher
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL_DEVICE_ARCH)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL_DEVICE_ARCH)
|
||||
target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
|
||||
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
|
||||
endif()
|
||||
|
||||
@@ -66,41 +66,6 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
|
||||
return sycl_down_blk_size;
|
||||
}
|
||||
|
||||
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const ggml_sycl_op_flatten_t op) try {
|
||||
|
||||
const bool use_src1 = src1 != nullptr;
|
||||
if(use_src1)
|
||||
GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
|
||||
GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
|
||||
|
||||
// dd = data device
|
||||
float * src0_ddf = (float *) src0->data;
|
||||
float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
|
||||
float * dst_ddf = (float *) dst->data;
|
||||
|
||||
ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
|
||||
ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
|
||||
ggml_sycl_pool_alloc<float> dst_f(ctx.pool());
|
||||
|
||||
ggml_sycl_set_device(ctx.device);
|
||||
queue_ptr main_stream = ctx.stream();
|
||||
// GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
|
||||
// ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);
|
||||
|
||||
// do the computation
|
||||
op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
|
||||
// print_ggml_tensor("tensor", dst);
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
|
||||
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
|
||||
|
||||
@@ -494,12 +494,6 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
|
||||
|
||||
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
|
||||
|
||||
typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1,
|
||||
ggml_tensor *dst, const float *src0_dd,
|
||||
const float *src1_dd, float *dst_dd,
|
||||
const queue_ptr &main_stream);
|
||||
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
@@ -757,24 +751,22 @@ struct bin_bcast_sycl {
|
||||
|
||||
template <class op>
|
||||
inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const float *src0_dd, const float *src1_dd,
|
||||
float *dst_dd,
|
||||
const queue_ptr &main_stream) {
|
||||
const ggml_tensor *src1, ggml_tensor *dst) {
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
||||
op()(ctx, src0, src1, dst, (const float *)src0->data, (const float *)src1->data, (float *)dst->data, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
|
||||
(sycl::half *)dst_dd, main_stream);
|
||||
op()(ctx, src0, src1, dst, (const sycl::half *)src0->data, (const float *)src1->data,
|
||||
(sycl::half *)dst->data, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
|
||||
op()(ctx, src0, src1, dst, (const sycl::half *)src0->data, (const float *)src1->data, (float *)dst->data,
|
||||
main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
|
||||
op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
|
||||
op()(ctx, src0, src1, dst, (const int32_t *)src0->data, (const int32_t *)src1->data, (int32_t *)dst->data,
|
||||
main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
|
||||
op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
|
||||
op()(ctx, src0, src1, dst, (const int16_t *)src0->data, (const int16_t *)src1->data, (int16_t *)dst->data,
|
||||
main_stream);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
||||
@@ -784,8 +776,4 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
|
||||
}
|
||||
|
||||
bool gpu_has_xmx(sycl::device &dev);
|
||||
|
||||
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const ggml_sycl_op_flatten_t op);
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user