mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-18 23:14:06 +00:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7ea8d80d53 | ||
|
|
42c76d1358 | ||
|
|
9f7d4bcf5c | ||
|
|
1d1ccce676 | ||
|
|
9fe94ccac9 | ||
|
|
66b039a501 | ||
|
|
20f1789dfb | ||
|
|
231cff5f6f | ||
|
|
3246fe84d7 | ||
|
|
78eb487bb0 | ||
|
|
a77feb5d71 | ||
|
|
2e59d61c1b | ||
|
|
75e1dbbaab |
@@ -1,18 +1,16 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
|
||||
ARG CUDA_VERSION=12.6.0
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
@@ -24,13 +22,12 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable CUDA
|
||||
ENV GGML_CUDA=1
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
RUN make -j$(nproc)
|
||||
# Use the default CUDA archs if not specified
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release --target llama-cli -j$(nproc) && \
|
||||
cp build/bin/* .
|
||||
|
||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
ARG CUDA_VERSION=12.6.0
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the CUDA runtime image
|
||||
@@ -8,28 +8,30 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git
|
||||
apt-get install -y build-essential git cmake
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable CUDA
|
||||
ENV GGML_CUDA=1
|
||||
|
||||
RUN make -j$(nproc) llama-cli
|
||||
# Use the default CUDA archs if not specified
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release --target llama-cli -j$(nproc)
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libgomp1
|
||||
|
||||
COPY --from=build /app/llama-cli /llama-cli
|
||||
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
|
||||
COPY --from=build /app/build/src/libllama.so /libllama.so
|
||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||
|
||||
ENTRYPOINT [ "/llama-cli" ]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
ARG CUDA_VERSION=12.6.0
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the CUDA runtime image
|
||||
@@ -8,31 +8,34 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git libcurl4-openssl-dev
|
||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable CUDA
|
||||
ENV GGML_CUDA=1
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
RUN make -j$(nproc) llama-server
|
||||
# Use the default CUDA archs if not specified
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release --target llama-server -j$(nproc)
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
||||
|
||||
COPY --from=build /app/llama-server /llama-server
|
||||
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
|
||||
COPY --from=build /app/build/src/libllama.so /libllama.so
|
||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
|
||||
@@ -26,6 +26,8 @@ RUN apt-get update && \
|
||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
|
||||
@@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
ENV GGML_HIPBLAS=1
|
||||
ENV CC=/opt/rocm/llvm/bin/clang
|
||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
@@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
|
||||
rm -rf /app
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
|
||||
@@ -21,6 +21,8 @@ RUN apt-get update && \
|
||||
COPY --from=build /app/llama-server /llama-server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
|
||||
15
.github/workflows/docker.yml
vendored
15
.github/workflows/docker.yml
vendored
@@ -96,21 +96,12 @@ jobs:
|
||||
env:
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
- name: Build and push Docker image (versioned)
|
||||
- name: Build and push Docker image (tagged + versioned)
|
||||
if: github.event_name == 'push'
|
||||
uses: docker/build-push-action@v4
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
- name: Build and push Docker image (tagged)
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' }}
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
@@ -251,6 +251,57 @@ int32_t cpu_get_num_math() {
|
||||
return cpu_get_num_physical_cores();
|
||||
}
|
||||
|
||||
// Helper for setting process priority
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
bool set_process_priority(enum ggml_sched_priority prio) {
|
||||
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
||||
return true;
|
||||
}
|
||||
|
||||
DWORD p = NORMAL_PRIORITY_CLASS;
|
||||
switch (prio) {
|
||||
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
|
||||
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
|
||||
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
|
||||
case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
|
||||
}
|
||||
|
||||
if (!SetPriorityClass(GetCurrentProcess(), p)) {
|
||||
fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#else // MacOS and POSIX
|
||||
#include <sys/types.h>
|
||||
#include <sys/resource.h>
|
||||
|
||||
bool set_process_priority(enum ggml_sched_priority prio) {
|
||||
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int p = 0;
|
||||
switch (prio) {
|
||||
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
|
||||
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
|
||||
case GGML_SCHED_PRIO_HIGH: p = -10; break;
|
||||
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
|
||||
}
|
||||
|
||||
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
||||
fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
@@ -277,6 +328,30 @@ void gpt_params_handle_model_default(gpt_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
|
||||
int32_t n_set = 0;
|
||||
|
||||
if (cpuparams.n_threads < 0) {
|
||||
// Assuming everything about cpuparams is invalid
|
||||
if (role_model != nullptr) {
|
||||
cpuparams = *role_model;
|
||||
} else {
|
||||
cpuparams.n_threads = cpu_get_num_math();
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
|
||||
if (cpuparams.cpumask[i]) {
|
||||
n_set++;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_set && n_set < cpuparams.n_threads) {
|
||||
// Not enough set bits, may experience performance issues.
|
||||
fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
|
||||
}
|
||||
}
|
||||
|
||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
@@ -296,6 +371,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
postprocess_cpu_params(params.cpuparams, nullptr);
|
||||
postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
|
||||
postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams);
|
||||
postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch);
|
||||
|
||||
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
||||
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
||||
}
|
||||
@@ -327,7 +407,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
void gpt_params_parse_from_env(gpt_params & params) {
|
||||
// we only care about server-related params for now
|
||||
get_env("LLAMA_ARG_MODEL", params.model);
|
||||
get_env("LLAMA_ARG_THREADS", params.n_threads);
|
||||
get_env("LLAMA_ARG_MODEL_URL", params.model_url);
|
||||
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
|
||||
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
|
||||
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
|
||||
get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
|
||||
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
|
||||
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
|
||||
get_env("LLAMA_ARG_BATCH", params.n_batch);
|
||||
@@ -341,6 +425,9 @@ void gpt_params_parse_from_env(gpt_params & params) {
|
||||
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
|
||||
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
|
||||
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
|
||||
get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching);
|
||||
get_env("LLAMA_ARG_HOST", params.hostname);
|
||||
get_env("LLAMA_ARG_PORT", params.port);
|
||||
}
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
@@ -361,6 +448,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
||||
size_t dash_loc = range.find('-');
|
||||
if (dash_loc == std::string::npos) {
|
||||
fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t start_i;
|
||||
size_t end_i;
|
||||
|
||||
if (dash_loc == 0) {
|
||||
start_i = 0;
|
||||
} else {
|
||||
start_i = std::stoull(range.substr(0, dash_loc));
|
||||
if (start_i >= GGML_MAX_N_THREADS) {
|
||||
fprintf(stderr, "Start index out of bounds!\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (dash_loc == range.length() - 1) {
|
||||
end_i = GGML_MAX_N_THREADS - 1;
|
||||
} else {
|
||||
end_i = std::stoull(range.substr(dash_loc + 1));
|
||||
if (end_i >= GGML_MAX_N_THREADS) {
|
||||
fprintf(stderr, "End index out of bounds!\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = start_i; i <= end_i; i++) {
|
||||
boolmask[i] = true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
||||
// Discard potential 0x prefix
|
||||
size_t start_i = 0;
|
||||
if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
|
||||
start_i = 2;
|
||||
}
|
||||
|
||||
size_t num_digits = mask.length() - start_i;
|
||||
if (num_digits > 128) num_digits = 128;
|
||||
|
||||
size_t end_i = num_digits + start_i;
|
||||
|
||||
for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
|
||||
char c = mask.at(i);
|
||||
int8_t id = c;
|
||||
|
||||
if ((c >= '0' && c <= '9')) {
|
||||
id -= '0';
|
||||
} else if (c >= 'a' && c <= 'f') {
|
||||
id -= 'a' - 10;
|
||||
} else if (c >= 'A' && c <= 'F') {
|
||||
id -= 'A' - 10;
|
||||
} else {
|
||||
fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
|
||||
return false;
|
||||
}
|
||||
|
||||
boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
|
||||
boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
|
||||
boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
|
||||
boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
|
||||
|
||||
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
|
||||
@@ -377,36 +537,142 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
}
|
||||
if (arg == "-t" || arg == "--threads") {
|
||||
CHECK_ARG
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
if (params.n_threads <= 0) {
|
||||
params.n_threads = std::thread::hardware_concurrency();
|
||||
params.cpuparams.n_threads = std::stoi(argv[i]);
|
||||
if (params.cpuparams.n_threads <= 0) {
|
||||
params.cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "-C" || arg == "--cpu-mask") {
|
||||
CHECK_ARG
|
||||
std::string mask = argv[i];
|
||||
params.cpuparams.mask_valid = true;
|
||||
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Cr" || arg == "--cpu-range") {
|
||||
CHECK_ARG
|
||||
std::string range = argv[i];
|
||||
params.cpuparams.mask_valid = true;
|
||||
invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--prio") {
|
||||
CHECK_ARG
|
||||
params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cpu-strict") {
|
||||
CHECK_ARG
|
||||
params.cpuparams.strict_cpu = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--poll") {
|
||||
CHECK_ARG
|
||||
params.cpuparams.poll = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-tb" || arg == "--threads-batch") {
|
||||
CHECK_ARG
|
||||
params.n_threads_batch = std::stoi(argv[i]);
|
||||
if (params.n_threads_batch <= 0) {
|
||||
params.n_threads_batch = std::thread::hardware_concurrency();
|
||||
params.cpuparams_batch.n_threads = std::stoi(argv[i]);
|
||||
if (params.cpuparams_batch.n_threads <= 0) {
|
||||
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Cb" || arg == "--cpu-mask-batch") {
|
||||
CHECK_ARG
|
||||
std::string mask = argv[i];
|
||||
params.cpuparams_batch.mask_valid = true;
|
||||
invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Crb" || arg == "--cpu-range_batch") {
|
||||
CHECK_ARG
|
||||
std::string range = argv[i];
|
||||
params.cpuparams_batch.mask_valid = true;
|
||||
invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--prio-batch") {
|
||||
CHECK_ARG
|
||||
params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cpu-strict-batch") {
|
||||
params.cpuparams_batch.strict_cpu = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--poll-batch") {
|
||||
CHECK_ARG
|
||||
params.cpuparams_batch.poll = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-td" || arg == "--threads-draft") {
|
||||
CHECK_ARG
|
||||
params.n_threads_draft = std::stoi(argv[i]);
|
||||
if (params.n_threads_draft <= 0) {
|
||||
params.n_threads_draft = std::thread::hardware_concurrency();
|
||||
params.draft_cpuparams.n_threads = std::stoi(argv[i]);
|
||||
if (params.draft_cpuparams.n_threads <= 0) {
|
||||
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Cd" || arg == "--cpu-mask-draft") {
|
||||
CHECK_ARG
|
||||
std::string mask = argv[i];
|
||||
params.draft_cpuparams.mask_valid = true;
|
||||
invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Crd" || arg == "--cpu-range-draft") {
|
||||
CHECK_ARG
|
||||
std::string range = argv[i];
|
||||
params.draft_cpuparams.mask_valid = true;
|
||||
invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--prio-draft") {
|
||||
CHECK_ARG
|
||||
params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cpu-strict-draft") {
|
||||
params.draft_cpuparams.strict_cpu = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--poll-draft") {
|
||||
CHECK_ARG
|
||||
params.draft_cpuparams.poll = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-tbd" || arg == "--threads-batch-draft") {
|
||||
CHECK_ARG
|
||||
params.n_threads_batch_draft = std::stoi(argv[i]);
|
||||
if (params.n_threads_batch_draft <= 0) {
|
||||
params.n_threads_batch_draft = std::thread::hardware_concurrency();
|
||||
params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
|
||||
if (params.draft_cpuparams_batch.n_threads <= 0) {
|
||||
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
|
||||
CHECK_ARG
|
||||
std::string range = argv[i];
|
||||
params.draft_cpuparams_batch.mask_valid = true;
|
||||
invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--prio-batch-draft") {
|
||||
CHECK_ARG
|
||||
params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cpu-strict-batch-draft") {
|
||||
params.draft_cpuparams_batch.strict_cpu = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--poll-batch-draft") {
|
||||
CHECK_ARG
|
||||
params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-p" || arg == "--prompt") {
|
||||
CHECK_ARG
|
||||
params.prompt = argv[i];
|
||||
@@ -1491,11 +1757,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
|
||||
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
|
||||
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
|
||||
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
|
||||
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
|
||||
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
||||
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
|
||||
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
|
||||
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
|
||||
options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
|
||||
|
||||
#ifndef GGML_USE_OPENMP
|
||||
// these options are available only with the internal threadpool
|
||||
options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
|
||||
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
|
||||
options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
|
||||
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
|
||||
options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
|
||||
|
||||
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
|
||||
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
|
||||
options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
|
||||
options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
|
||||
options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"});
|
||||
|
||||
options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
|
||||
options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
|
||||
options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
|
||||
options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
|
||||
options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"});
|
||||
|
||||
options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
|
||||
options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
|
||||
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
|
||||
options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>",
|
||||
"Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
|
||||
options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
|
||||
options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"});
|
||||
#endif // GGML_USE_OPENMP
|
||||
|
||||
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
|
||||
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
|
||||
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
|
||||
@@ -1767,7 +2062,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
|
||||
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
|
||||
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
||||
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
|
||||
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
|
||||
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
@@ -1799,9 +2093,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
std::string gpt_params_get_system_info(const gpt_params & params) {
|
||||
std::ostringstream os;
|
||||
|
||||
os << "system_info: n_threads = " << params.n_threads;
|
||||
if (params.n_threads_batch != -1) {
|
||||
os << " (n_threads_batch = " << params.n_threads_batch << ")";
|
||||
os << "system_info: n_threads = " << params.cpuparams.n_threads;
|
||||
if (params.cpuparams_batch.n_threads != -1) {
|
||||
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
|
||||
}
|
||||
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
||||
// TODO: windows + arm64 + mingw64
|
||||
@@ -2325,8 +2619,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
cparams.n_seq_max = params.n_parallel;
|
||||
cparams.n_batch = params.n_batch;
|
||||
cparams.n_ubatch = params.n_ubatch;
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
cparams.n_threads = params.cpuparams.n_threads;
|
||||
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
||||
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
||||
cparams.seed = params.seed;
|
||||
cparams.logits_all = params.logits_all;
|
||||
cparams.embeddings = params.embedding;
|
||||
@@ -2352,6 +2647,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
return cparams;
|
||||
}
|
||||
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
|
||||
struct ggml_threadpool_params tpp;
|
||||
|
||||
ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
|
||||
|
||||
if (params.mask_valid) {
|
||||
std::memcpy(&tpp.cpumask, ¶ms.cpumask, GGML_MAX_N_THREADS);
|
||||
}
|
||||
|
||||
tpp.prio = params.priority;
|
||||
tpp.poll = params.poll;
|
||||
tpp.strict_cpu = params.strict_cpu;
|
||||
|
||||
return tpp;
|
||||
}
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
static bool starts_with(const std::string & str, const std::string & prefix) {
|
||||
@@ -3341,7 +3652,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
||||
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
|
||||
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
|
||||
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
|
||||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
||||
|
||||
@@ -67,13 +67,18 @@ enum dimre_method {
|
||||
DIMRE_METHOD_MEAN,
|
||||
};
|
||||
|
||||
struct cpu_params {
|
||||
int n_threads = -1;
|
||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||
bool mask_valid = false; // Default: any CPU
|
||||
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||
bool strict_cpu = false; // Use strict CPU placement
|
||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||
};
|
||||
|
||||
struct gpt_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||
|
||||
int32_t n_threads = cpu_get_num_math();
|
||||
int32_t n_threads_draft = -1;
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_threads_batch_draft = -1;
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 0; // context size
|
||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
@@ -100,6 +105,11 @@ struct gpt_params {
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
struct cpu_params draft_cpuparams;
|
||||
struct cpu_params draft_cpuparams_batch;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||
void * cb_eval_user_data = nullptr;
|
||||
|
||||
@@ -204,7 +214,7 @@ struct gpt_params {
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests
|
||||
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = "";
|
||||
@@ -277,6 +287,11 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||
|
||||
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||
|
||||
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
|
||||
bool set_process_priority(enum ggml_sched_priority prio);
|
||||
|
||||
//
|
||||
// String utils
|
||||
//
|
||||
@@ -327,8 +342,9 @@ struct llama_init_result {
|
||||
|
||||
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
|
||||
|
||||
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
||||
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
|
||||
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
|
||||
@@ -1572,7 +1572,7 @@ class LlamaModel(Model):
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||
base = self.hparams.get("rope_theta", 10000.0)
|
||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
|
||||
factor = rope_scaling.get("factor", 8.0)
|
||||
@@ -3820,7 +3820,7 @@ class ExaoneModel(Model):
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||
base = self.hparams.get("rope_theta", 10000.0)
|
||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
|
||||
factor = rope_scaling.get("factor", 8.0)
|
||||
|
||||
@@ -66,8 +66,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
|
||||
|
||||
The defaults are:
|
||||
|
||||
- `CUDA_VERSION` set to `11.7.1`
|
||||
- `CUDA_DOCKER_ARCH` set to `all`
|
||||
- `CUDA_VERSION` set to `12.6.0`
|
||||
- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
|
||||
|
||||
The resulting images, are essentially the same as the non-CUDA images:
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
|
||||
#endif
|
||||
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
buf.resize(plan.work_size);
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
#endif
|
||||
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
buf.resize(plan.work_size);
|
||||
@@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
||||
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
|
||||
|
||||
struct benchmark_params_struct {
|
||||
int32_t n_threads = 1;
|
||||
int n_threads = 1;
|
||||
int32_t n_iterations = 10;
|
||||
};
|
||||
|
||||
|
||||
@@ -486,8 +486,8 @@ int main(int argc, char ** argv) {
|
||||
if (use_pca) {
|
||||
// run PCA
|
||||
PCA::pca_params pca_params;
|
||||
pca_params.n_threads = params.n_threads;
|
||||
pca_params.n_batch = params.n_pca_batch;
|
||||
pca_params.n_threads = params.cpuparams.n_threads;
|
||||
pca_params.n_batch = params.n_pca_batch;
|
||||
pca_params.n_iterations = params.n_pca_iterations;
|
||||
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||
} else {
|
||||
|
||||
@@ -410,7 +410,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
g_verbose = (params.verbosity == 1);
|
||||
try {
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
|
||||
ctx.run_merge();
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s\n", err.what());
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
@@ -225,6 +226,9 @@ struct cmd_params {
|
||||
std::vector<ggml_type> type_k;
|
||||
std::vector<ggml_type> type_v;
|
||||
std::vector<int> n_threads;
|
||||
std::vector<std::string> cpu_mask;
|
||||
std::vector<bool> cpu_strict;
|
||||
std::vector<int> poll;
|
||||
std::vector<int> n_gpu_layers;
|
||||
std::vector<std::string> rpc_servers;
|
||||
std::vector<llama_split_mode> split_mode;
|
||||
@@ -236,6 +240,8 @@ struct cmd_params {
|
||||
std::vector<bool> embeddings;
|
||||
ggml_numa_strategy numa;
|
||||
int reps;
|
||||
ggml_sched_priority prio;
|
||||
int delay;
|
||||
bool verbose;
|
||||
output_formats output_format;
|
||||
output_formats output_format_stderr;
|
||||
@@ -251,6 +257,9 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* type_k */ {GGML_TYPE_F16},
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {cpu_get_num_math()},
|
||||
/* cpu_mask */ {"0x0"},
|
||||
/* cpu_strict */ {false},
|
||||
/* poll */ {50},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* rpc_servers */ {""},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
@@ -262,6 +271,8 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* embeddings */ {false},
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
||||
/* delay */ 0,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN,
|
||||
/* output_format_stderr */ NONE,
|
||||
@@ -281,6 +292,9 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
|
||||
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
|
||||
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||
@@ -292,6 +306,8 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
||||
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
@@ -338,6 +354,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
||||
params.reps = cmd_params_defaults.reps;
|
||||
params.numa = cmd_params_defaults.numa;
|
||||
params.prio = cmd_params_defaults.prio;
|
||||
params.delay = cmd_params_defaults.delay;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
@@ -433,6 +451,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
||||
} else if (arg == "-C" || arg == "--cpu-mask") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
|
||||
} else if (arg == "--cpu-strict") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
|
||||
} else if (arg == "--poll") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.poll.insert(params.poll.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -541,6 +580,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
params.reps = std::stoi(argv[i]);
|
||||
} else if (arg == "--prio") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
|
||||
} else if (arg == "--delay") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.delay = std::stoi(argv[i]);
|
||||
} else if (arg == "-o" || arg == "--output") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -585,6 +636,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
||||
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
|
||||
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
|
||||
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
|
||||
|
||||
return params;
|
||||
}
|
||||
@@ -598,6 +652,9 @@ struct cmd_params_instance {
|
||||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
int n_threads;
|
||||
std::string cpu_mask;
|
||||
bool cpu_strict;
|
||||
int poll;
|
||||
int n_gpu_layers;
|
||||
std::string rpc_servers;
|
||||
llama_split_mode split_mode;
|
||||
@@ -667,7 +724,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & tv : params.type_v)
|
||||
for (const auto & nkvo : params.no_kv_offload)
|
||||
for (const auto & fa : params.flash_attn)
|
||||
for (const auto & nt : params.n_threads) {
|
||||
for (const auto & nt : params.n_threads)
|
||||
for (const auto & cm : params.cpu_mask)
|
||||
for (const auto & cs : params.cpu_strict)
|
||||
for (const auto & pl : params.poll) {
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
continue;
|
||||
@@ -681,6 +741,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .type_k = */ tk,
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .cpu_mask = */ cm,
|
||||
/* .cpu_strict = */ cs,
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
@@ -707,6 +770,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .type_k = */ tk,
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .cpu_mask = */ cm,
|
||||
/* .cpu_strict = */ cs,
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
@@ -733,6 +799,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .type_k = */ tk,
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .cpu_mask = */ cm,
|
||||
/* .cpu_strict = */ cs,
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
@@ -769,6 +838,9 @@ struct test {
|
||||
int n_batch;
|
||||
int n_ubatch;
|
||||
int n_threads;
|
||||
std::string cpu_mask;
|
||||
bool cpu_strict;
|
||||
int poll;
|
||||
bool has_rpc;
|
||||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
@@ -795,6 +867,9 @@ struct test {
|
||||
n_batch = inst.n_batch;
|
||||
n_ubatch = inst.n_ubatch;
|
||||
n_threads = inst.n_threads;
|
||||
cpu_mask = inst.cpu_mask;
|
||||
cpu_strict = inst.cpu_strict;
|
||||
poll = inst.poll;
|
||||
has_rpc = !inst.rpc_servers.empty();
|
||||
type_k = inst.type_k;
|
||||
type_v = inst.type_v;
|
||||
@@ -872,13 +947,14 @@ struct test {
|
||||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_ubatch",
|
||||
"n_threads", "type_k", "type_v",
|
||||
"n_threads", "cpu_mask", "cpu_strict", "poll",
|
||||
"type_k", "type_v",
|
||||
"n_gpu_layers", "split_mode",
|
||||
"main_gpu", "no_kv_offload", "flash_attn",
|
||||
"tensor_split", "use_mmap", "embeddings",
|
||||
"n_prompt", "n_gen", "test_time",
|
||||
"avg_ns", "stddev_ns",
|
||||
"avg_ts", "stddev_ts"
|
||||
"avg_ts", "stddev_ts",
|
||||
};
|
||||
return fields;
|
||||
}
|
||||
@@ -887,7 +963,7 @@ struct test {
|
||||
|
||||
static field_type get_field_type(const std::string & field) {
|
||||
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
|
||||
field == "n_threads" ||
|
||||
field == "n_threads" || field == "poll" ||
|
||||
field == "model_size" || field == "model_n_params" ||
|
||||
field == "n_gpu_layers" || field == "main_gpu" ||
|
||||
field == "n_prompt" || field == "n_gen" ||
|
||||
@@ -896,6 +972,7 @@ struct test {
|
||||
}
|
||||
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||
field == "cpu_strict" ||
|
||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
||||
return BOOL;
|
||||
}
|
||||
@@ -928,7 +1005,8 @@ struct test {
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_ubatch),
|
||||
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
|
||||
ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
||||
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
||||
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
||||
@@ -1067,7 +1145,7 @@ struct markdown_printer : public printer {
|
||||
return -30;
|
||||
}
|
||||
if (field == "t/s") {
|
||||
return 16;
|
||||
return 20;
|
||||
}
|
||||
if (field == "size" || field == "params") {
|
||||
return 10;
|
||||
@@ -1149,6 +1227,15 @@ struct markdown_printer : public printer {
|
||||
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
||||
fields.emplace_back("n_threads");
|
||||
}
|
||||
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
|
||||
fields.emplace_back("cpu_mask");
|
||||
}
|
||||
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
|
||||
fields.emplace_back("cpu_strict");
|
||||
}
|
||||
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
|
||||
fields.emplace_back("poll");
|
||||
}
|
||||
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
||||
fields.emplace_back("n_batch");
|
||||
}
|
||||
@@ -1383,6 +1470,8 @@ int main(int argc, char ** argv) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
set_process_priority(params.prio);
|
||||
|
||||
// initialize printer
|
||||
std::unique_ptr<printer> p = create_printer(params.output_format);
|
||||
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
||||
@@ -1428,6 +1517,28 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
// cool off before the test
|
||||
if (params.delay) {
|
||||
std::this_thread::sleep_for(std::chrono::seconds(params.delay));
|
||||
}
|
||||
|
||||
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
|
||||
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
|
||||
LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
|
||||
exit(1);
|
||||
}
|
||||
tpp.strict_cpu = t.cpu_strict;
|
||||
tpp.poll = t.poll;
|
||||
tpp.prio = params.prio;
|
||||
|
||||
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
|
||||
if (!threadpool) {
|
||||
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
llama_attach_threadpool(ctx, threadpool, NULL);
|
||||
|
||||
// warmup run
|
||||
if (t.n_prompt > 0) {
|
||||
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
||||
@@ -1466,6 +1577,8 @@ int main(int argc, char ** argv) {
|
||||
llama_print_timings(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
}
|
||||
|
||||
llama_free_model(lmodel);
|
||||
|
||||
@@ -71,8 +71,8 @@ actor LlamaContext {
|
||||
var ctx_params = llama_context_default_params()
|
||||
ctx_params.seed = 1234
|
||||
ctx_params.n_ctx = 2048
|
||||
ctx_params.n_threads = UInt32(n_threads)
|
||||
ctx_params.n_threads_batch = UInt32(n_threads)
|
||||
ctx_params.n_threads = Int32(n_threads)
|
||||
ctx_params.n_threads_batch = Int32(n_threads)
|
||||
|
||||
let context = llama_new_context_with_model(model, ctx_params)
|
||||
guard let context else {
|
||||
|
||||
@@ -15,8 +15,8 @@ cd llama.cpp
|
||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
|
||||
|
||||
```bash
|
||||
python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||
|
||||
# quantize int4 version
|
||||
|
||||
@@ -1623,7 +1623,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
|
||||
}
|
||||
}
|
||||
|
||||
inline float clip(float x, float lower, float upper) {
|
||||
inline int clip(int x, int lower, int upper) {
|
||||
return std::max(lower, std::min(x, upper));
|
||||
}
|
||||
|
||||
@@ -1827,10 +1827,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
|
||||
return refine_size;
|
||||
}
|
||||
|
||||
inline int clip(int x, int lower, int upper) {
|
||||
return std::max(lower, std::min(x, upper));
|
||||
}
|
||||
|
||||
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
|
||||
std::vector<int> candidate_split_grids_nums;
|
||||
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
||||
|
||||
@@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
|
||||
if (!params->image.empty()) {
|
||||
LOG_TEE("using base64 encoded image instead of command line image path\n");
|
||||
}
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
||||
if (!embed) {
|
||||
LOG_TEE("%s: can't load image from prompt\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
params->prompt = remove_image_from_prompt(prompt);
|
||||
} else {
|
||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
|
||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embed) {
|
||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
||||
return NULL;
|
||||
|
||||
@@ -180,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
|
||||
|
||||
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
|
||||
auto ctx_clip = clip_init_context(params);
|
||||
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
|
||||
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embeds) {
|
||||
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
|
||||
return NULL;
|
||||
|
||||
@@ -221,6 +221,40 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG("%s: llama threadpool init = n_threads = %d\n",
|
||||
__func__,
|
||||
(int) params.cpuparams.n_threads
|
||||
);
|
||||
struct ggml_threadpool_params tpp_batch =
|
||||
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
||||
struct ggml_threadpool_params tpp =
|
||||
ggml_threadpool_params_from_cpu_params(params.cpuparams);
|
||||
|
||||
set_process_priority(params.cpuparams.priority);
|
||||
|
||||
struct ggml_threadpool * threadpool_batch = NULL;
|
||||
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
||||
threadpool_batch = ggml_threadpool_new(&tpp_batch);
|
||||
if (!threadpool_batch) {
|
||||
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Start the non-batch threadpool in the paused state
|
||||
tpp.paused = true;
|
||||
}
|
||||
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||
if (!threadpool) {
|
||||
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
|
||||
if (ctx_guidance) {
|
||||
llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG("n_ctx: %d\n", n_ctx);
|
||||
@@ -989,6 +1023,9 @@ int main(int argc, char ** argv) {
|
||||
llama_sampling_free(ctx_sampling);
|
||||
llama_backend_free();
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
ggml_threadpool_free(threadpool_batch);
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
LOG_TEE("Log end\n");
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
@@ -249,23 +249,49 @@ logging:
|
||||
|
||||
Available environment variables (if specified, these variables will override parameters specified in arguments):
|
||||
|
||||
- `LLAMA_CACHE` (cache directory, used by `--hf-repo`)
|
||||
- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`)
|
||||
- `LLAMA_ARG_MODEL`
|
||||
- `LLAMA_ARG_THREADS`
|
||||
- `LLAMA_ARG_CTX_SIZE`
|
||||
- `LLAMA_ARG_N_PARALLEL`
|
||||
- `LLAMA_ARG_BATCH`
|
||||
- `LLAMA_ARG_UBATCH`
|
||||
- `LLAMA_ARG_N_GPU_LAYERS`
|
||||
- `LLAMA_ARG_THREADS_HTTP`
|
||||
- `LLAMA_ARG_CHAT_TEMPLATE`
|
||||
- `LLAMA_ARG_N_PREDICT`
|
||||
- `LLAMA_ARG_ENDPOINT_METRICS`
|
||||
- `LLAMA_ARG_ENDPOINT_SLOTS`
|
||||
- `LLAMA_ARG_EMBEDDINGS`
|
||||
- `LLAMA_ARG_FLASH_ATTN`
|
||||
- `LLAMA_ARG_DEFRAG_THOLD`
|
||||
- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
|
||||
- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
|
||||
- `LLAMA_ARG_MODEL`: equivalent to `-m`
|
||||
- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
|
||||
- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
|
||||
- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
|
||||
- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
|
||||
- `LLAMA_ARG_THREADS`: equivalent to `-t`
|
||||
- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
|
||||
- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
|
||||
- `LLAMA_ARG_BATCH`: equivalent to `-b`
|
||||
- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
|
||||
- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
|
||||
- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
|
||||
- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
|
||||
- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
|
||||
- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
|
||||
- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
|
||||
- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
|
||||
- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
|
||||
- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
|
||||
- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
|
||||
- `LLAMA_ARG_HOST`: equivalent to `--host`
|
||||
- `LLAMA_ARG_PORT`: equivalent to `--port`
|
||||
|
||||
Example usage of docker compose with environment variables:
|
||||
|
||||
```yml
|
||||
services:
|
||||
llamacpp-server:
|
||||
image: ghcr.io/ggerganov/llama.cpp:server
|
||||
ports:
|
||||
- 8080:8080
|
||||
volumes:
|
||||
- ./models:/models
|
||||
environment:
|
||||
# alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
|
||||
LLAMA_ARG_MODEL: /models/my_model.gguf
|
||||
LLAMA_ARG_CTX_SIZE: 4096
|
||||
LLAMA_ARG_N_PARALLEL: 2
|
||||
LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0
|
||||
LLAMA_ARG_PORT: 8080
|
||||
```
|
||||
|
||||
## Build
|
||||
|
||||
|
||||
@@ -2534,8 +2534,8 @@ int main(int argc, char ** argv) {
|
||||
});
|
||||
|
||||
LOG_INFO("system info", {
|
||||
{"n_threads", params.n_threads},
|
||||
{"n_threads_batch", params.n_threads_batch},
|
||||
{"n_threads", params.cpuparams.n_threads},
|
||||
{"n_threads_batch", params.cpuparams_batch.n_threads},
|
||||
{"total_threads", std::thread::hardware_concurrency()},
|
||||
{"system_info", llama_print_system_info()},
|
||||
});
|
||||
@@ -2572,7 +2572,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
auto res_error = [](httplib::Response & res, json error_data) {
|
||||
json final_response {{"error", error_data}};
|
||||
res.set_content(final_response.dump(), MIMETYPE_JSON);
|
||||
res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
|
||||
res.status = json_value(error_data, "code", 500);
|
||||
};
|
||||
|
||||
|
||||
@@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
|
||||
// load the draft model
|
||||
params.model = params.model_draft;
|
||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
||||
if (params.n_threads_draft > 0) {
|
||||
params.n_threads = params.n_threads_draft;
|
||||
if (params.draft_cpuparams.n_threads > 0) {
|
||||
params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
|
||||
}
|
||||
params.n_threads_batch = params.n_threads_batch_draft;
|
||||
|
||||
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
|
||||
llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
|
||||
model_dft = llama_init_dft.model;
|
||||
ctx_dft = llama_init_dft.context;
|
||||
|
||||
6
flake.lock
generated
6
flake.lock
generated
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1723637854,
|
||||
"narHash": "sha256-med8+5DSWa2UnOqtdICndjDAEjxr5D7zaIiK4pn0Q7c=",
|
||||
"lastModified": 1724224976,
|
||||
"narHash": "sha256-Z/ELQhrSd7bMzTO8r7NZgi9g5emh+aRKoCdaAv5fiO0=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "c3aa7b8938b17aebd2deecf7be0636000d62a2b9",
|
||||
"rev": "c374d94f1536013ca8e92341b540eba4c22f9c62",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -7,8 +7,8 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
|
||||
// Tensor allocator
|
||||
struct ggml_tallocr {
|
||||
|
||||
@@ -63,6 +63,7 @@ extern "C" {
|
||||
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
// "offset" refers to the offset of the tensor data for setting/getting data
|
||||
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
@@ -102,6 +103,7 @@ extern "C" {
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
// Create a backend buffer from an existing pointer
|
||||
|
||||
@@ -220,7 +220,7 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
||||
#define GGML_FILE_VERSION 1
|
||||
#define GGML_FILE_VERSION 2
|
||||
|
||||
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
||||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
||||
@@ -231,6 +231,8 @@
|
||||
#define GGML_MAX_SRC 10
|
||||
#ifndef GGML_MAX_NAME
|
||||
#define GGML_MAX_NAME 64
|
||||
#define GGML_MAX_N_THREADS 512
|
||||
|
||||
#endif
|
||||
#define GGML_MAX_OP_PARAMS 64
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
@@ -453,6 +455,8 @@ extern "C" {
|
||||
GGML_OP_SQR,
|
||||
GGML_OP_SQRT,
|
||||
GGML_OP_LOG,
|
||||
GGML_OP_SIN,
|
||||
GGML_OP_COS,
|
||||
GGML_OP_SUM,
|
||||
GGML_OP_SUM_ROWS,
|
||||
GGML_OP_MEAN,
|
||||
@@ -490,9 +494,11 @@ extern "C" {
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_TRANSPOSE_1D,
|
||||
GGML_OP_IM2COL,
|
||||
GGML_OP_IM2COL_BACK,
|
||||
GGML_OP_CONV_TRANSPOSE_2D,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
GGML_OP_POOL_2D_BACK,
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
GGML_OP_PAD,
|
||||
GGML_OP_ARANGE,
|
||||
@@ -624,6 +630,29 @@ extern "C" {
|
||||
// If it returns true, the computation is aborted
|
||||
typedef bool (*ggml_abort_callback)(void * data);
|
||||
|
||||
// Scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// Threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
@@ -631,6 +660,7 @@ extern "C" {
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
||||
int n_threads;
|
||||
struct ggml_threadpool * threadpool;
|
||||
|
||||
// abort ggml_graph_compute when true
|
||||
ggml_abort_callback abort_callback;
|
||||
@@ -969,6 +999,22 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_sin(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_sin_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_cos(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_cos_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// return scalar
|
||||
GGML_API struct ggml_tensor * ggml_sum(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1566,34 +1612,49 @@ extern "C" {
|
||||
float min,
|
||||
float max);
|
||||
|
||||
// im2col
|
||||
// converts data into a format that effectively results in a convolution when combined with matrix multiplication
|
||||
GGML_API struct ggml_tensor * ggml_im2col(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1,
|
||||
bool is_2D,
|
||||
enum ggml_type dst_type);
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride dimension 0
|
||||
int s1, // stride dimension 1
|
||||
int p0, // padding dimension 0
|
||||
int p1, // padding dimension 1
|
||||
int d0, // dilation dimension 0
|
||||
int d1, // dilation dimension 1
|
||||
bool is_2D,
|
||||
enum ggml_type dst_type);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_im2col_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // gradient of im2col output
|
||||
int64_t * ne, // shape of im2col input
|
||||
int s0, // stride dimension 0
|
||||
int s1, // stride dimension 1
|
||||
int p0, // padding dimension 0
|
||||
int p1, // padding dimension 1
|
||||
int d0, // dilation dimension 0
|
||||
int d1, // dilation dimension 1
|
||||
bool is_2D);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1);
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride dimension 0
|
||||
int s1, // stride dimension 1
|
||||
int p0, // padding dimension 0
|
||||
int p1, // padding dimension 1
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride
|
||||
int p0, // padding
|
||||
int d0); // dilation
|
||||
@@ -1602,29 +1663,29 @@ extern "C" {
|
||||
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
|
||||
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s,
|
||||
int d);
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s, // stride
|
||||
int d); // dilation
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int p0,
|
||||
int d0);
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride
|
||||
int p0, // padding
|
||||
int d0); // dilation
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1);
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride dimension 0
|
||||
int s1, // stride dimension 1
|
||||
int p0, // padding dimension 0
|
||||
int p1, // padding dimension 1
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
|
||||
|
||||
// kernel size is a->ne[0] x a->ne[1]
|
||||
@@ -1686,6 +1747,18 @@ extern "C" {
|
||||
float p0,
|
||||
float p1);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_pool_2d_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * af, // "a"/input used in forward pass
|
||||
enum ggml_op_pool op,
|
||||
int k0,
|
||||
int k1,
|
||||
int s0,
|
||||
int s1,
|
||||
float p0,
|
||||
float p1);
|
||||
|
||||
// nearest interpolate
|
||||
// multiplies ne0 and ne1 by scale factor
|
||||
// used in stable-diffusion
|
||||
@@ -2010,10 +2083,23 @@ extern "C" {
|
||||
GGML_API size_t ggml_graph_overhead(void);
|
||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
|
||||
GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
||||
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||
const struct ggml_cgraph * cgraph,
|
||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
|
||||
@@ -1247,7 +1247,7 @@ endif()
|
||||
|
||||
# Data types, macros and functions related to controlling CPU affinity and
|
||||
# some memory allocation are available on Linux through GNU extensions in libc
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
add_compile_definitions(_GNU_SOURCE)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -722,9 +722,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
||||
#endif
|
||||
|
||||
struct ggml_backend_cpu_context {
|
||||
int n_threads;
|
||||
void * work_data;
|
||||
size_t work_size;
|
||||
int n_threads;
|
||||
ggml_threadpool_t threadpool;
|
||||
|
||||
void * work_data;
|
||||
size_t work_size;
|
||||
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
@@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
|
||||
|
||||
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
||||
|
||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
||||
|
||||
if (cpu_plan->cplan.work_size > 0) {
|
||||
@@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
|
||||
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||
|
||||
if (cpu_ctx->work_size < cplan.work_size) {
|
||||
free(cpu_ctx->work_data);
|
||||
@@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
||||
}
|
||||
|
||||
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||||
ctx->threadpool = NULL;
|
||||
ctx->work_data = NULL;
|
||||
ctx->work_size = 0;
|
||||
ctx->abort_callback = NULL;
|
||||
@@ -903,6 +906,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
||||
ctx->n_threads = n_threads;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
|
||||
if (ctx->threadpool && ctx->threadpool != threadpool) {
|
||||
// already had a different threadpool, pause/suspend it before switching
|
||||
ggml_threadpool_pause(ctx->threadpool);
|
||||
}
|
||||
ctx->threadpool = threadpool;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
|
||||
@@ -9,8 +9,10 @@
|
||||
#include "ggml-cuda/binbcast.cuh"
|
||||
#include "ggml-cuda/clamp.cuh"
|
||||
#include "ggml-cuda/concat.cuh"
|
||||
#include "ggml-cuda/conv-transpose-1d.cuh"
|
||||
#include "ggml-cuda/convert.cuh"
|
||||
#include "ggml-cuda/cpy.cuh"
|
||||
#include "ggml-cuda/cross-entropy-loss.cuh"
|
||||
#include "ggml-cuda/diagmask.cuh"
|
||||
#include "ggml-cuda/dmmv.cuh"
|
||||
#include "ggml-cuda/fattn.cuh"
|
||||
@@ -29,7 +31,6 @@
|
||||
#include "ggml-cuda/tsembd.cuh"
|
||||
#include "ggml-cuda/unary.cuh"
|
||||
#include "ggml-cuda/upscale.cuh"
|
||||
#include "ggml-cuda/conv-transpose-1d.cuh"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@@ -2181,6 +2182,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_ADD:
|
||||
ggml_cuda_op_add(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SUB:
|
||||
ggml_cuda_op_sub(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_ACC:
|
||||
ggml_cuda_op_acc(ctx, dst);
|
||||
break;
|
||||
@@ -2267,6 +2271,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_SQRT:
|
||||
ggml_cuda_op_sqrt(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SIN:
|
||||
ggml_cuda_op_sin(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_COS:
|
||||
ggml_cuda_op_cos(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CLAMP:
|
||||
ggml_cuda_op_clamp(ctx, dst);
|
||||
break;
|
||||
@@ -2303,6 +2313,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
ggml_cuda_flash_attn_ext(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
ggml_cuda_cross_entropy_loss(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -2610,6 +2623,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (node->src[j] != nullptr) {
|
||||
assert(node->src[j]->buffer);
|
||||
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
|
||||
}
|
||||
}
|
||||
@@ -2853,12 +2867,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
@@ -2890,6 +2907,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
}
|
||||
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
|
||||
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
return true;
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
default:
|
||||
return false;
|
||||
|
||||
@@ -9,6 +9,10 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_sub(const float a, const float b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_mul(const float a, const float b) {
|
||||
return a * b;
|
||||
}
|
||||
@@ -271,6 +275,10 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||
}
|
||||
|
||||
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||
}
|
||||
|
||||
@@ -2,5 +2,6 @@
|
||||
|
||||
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
106
ggml/src/ggml-cuda/cross-entropy-loss.cu
Normal file
106
ggml/src/ggml-cuda/cross-entropy-loss.cu
Normal file
@@ -0,0 +1,106 @@
|
||||
#include "common.cuh"
|
||||
#include "cross-entropy-loss.cuh"
|
||||
#include "sumrows.cuh"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
|
||||
static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) {
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE;
|
||||
|
||||
const int ne_tmp = WARP_SIZE*nclasses;
|
||||
|
||||
extern __shared__ float tmp_all[];
|
||||
float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp;
|
||||
float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp;
|
||||
|
||||
// Each warp first loads ne_tmp logits/labels into shared memory:
|
||||
for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) {
|
||||
const int ig = i0*nclasses + i; // ig == i global
|
||||
|
||||
tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f;
|
||||
tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f;
|
||||
}
|
||||
|
||||
// Each thread in the warp then calculates the cross entropy loss for a single row.
|
||||
// TODO: pad in order to avoid shared memory bank conflicts.
|
||||
|
||||
// Find maximum for softmax:
|
||||
float max = -INFINITY;
|
||||
for (int i = 0; i < nclasses; ++i) {
|
||||
max = fmaxf(max, tmp_logits[lane_id*nclasses + i]);
|
||||
}
|
||||
|
||||
// Calculate log(softmax(logits)) which is just logits - max:
|
||||
float sum = 0.0f;
|
||||
for (int i = 0; i < nclasses; ++i) {
|
||||
float val = tmp_logits[lane_id*nclasses + i] - max;
|
||||
sum += expf(val);
|
||||
tmp_logits[lane_id*nclasses + i] = val;
|
||||
}
|
||||
sum = logf(sum);
|
||||
|
||||
// log(exp(logits - max) / sum) = (logits - max) - log(sum)
|
||||
float loss = 0.0f;
|
||||
for (int i = 0; i < nclasses; ++i) {
|
||||
loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i];
|
||||
}
|
||||
loss = -warp_reduce_sum(loss) / (float)k;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (lane_id == 0) {
|
||||
tmp_all[warp_id] = loss;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (warp_id != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? tmp_all[lane_id] : 0.0f;
|
||||
loss = warp_reduce_sum(loss);
|
||||
|
||||
if (lane_id != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[blockIdx.x] = loss;
|
||||
}
|
||||
|
||||
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
|
||||
const dim3 blocks_num((nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
|
||||
const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*ne00*sizeof(float);
|
||||
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
|
||||
|
||||
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
|
||||
// Combine results from individual blocks:
|
||||
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
|
||||
}
|
||||
5
ggml/src/ggml-cuda/cross-entropy-loss.cuh
Normal file
5
ggml/src/ggml-cuda/cross-entropy-loss.cuh
Normal file
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -16,7 +16,7 @@ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int nc
|
||||
}
|
||||
}
|
||||
|
||||
static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
@@ -32,7 +32,6 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
|
||||
const int64_t ncols = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
|
||||
|
||||
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -101,6 +101,24 @@ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
|
||||
dst[i] = sqrtf(x[i]);
|
||||
}
|
||||
|
||||
static __global__ void sin_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = sinf(x[i]);
|
||||
}
|
||||
|
||||
static __global__ void cos_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = cosf(x[i]);
|
||||
}
|
||||
|
||||
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
||||
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
@@ -156,6 +174,16 @@ static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_
|
||||
sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
|
||||
sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
|
||||
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
@@ -312,3 +340,31 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
@@ -9,6 +9,8 @@
|
||||
#define CUDA_HARDSWISH_BLOCK_SIZE 256
|
||||
#define CUDA_SQR_BLOCK_SIZE 256
|
||||
#define CUDA_SQRT_BLOCK_SIZE 256
|
||||
#define CUDA_SIN_BLOCK_SIZE 256
|
||||
#define CUDA_COS_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -31,3 +33,7 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
|
||||
void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -31,6 +31,8 @@ struct ggml_metal_kernel {
|
||||
enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_ADD,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_SUB,
|
||||
GGML_METAL_KERNEL_TYPE_SUB_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_MUL,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_DIV,
|
||||
@@ -207,6 +209,9 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
|
||||
GGML_METAL_KERNEL_TYPE_CONCAT,
|
||||
GGML_METAL_KERNEL_TYPE_SQR,
|
||||
GGML_METAL_KERNEL_TYPE_SQRT,
|
||||
GGML_METAL_KERNEL_TYPE_SIN,
|
||||
GGML_METAL_KERNEL_TYPE_COS,
|
||||
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
|
||||
|
||||
GGML_METAL_KERNEL_TYPE_COUNT
|
||||
@@ -493,6 +498,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
|
||||
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB, sub, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW, sub_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
|
||||
@@ -669,6 +676,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
|
||||
}
|
||||
|
||||
@@ -769,15 +779,20 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
return true;
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
@@ -1057,6 +1072,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
{
|
||||
@@ -1080,6 +1096,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
nb = ne00 / 4;
|
||||
switch (dst->op) {
|
||||
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
|
||||
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
|
||||
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
|
||||
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
@@ -1089,6 +1106,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
} else {
|
||||
switch (dst->op) {
|
||||
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
|
||||
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
|
||||
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
|
||||
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
@@ -1416,6 +1434,48 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_SQRT:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_SIN:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_COS:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_SUM_ROWS:
|
||||
|
||||
@@ -17,7 +17,7 @@ enum ggml_sort_order {
|
||||
GGML_SORT_ORDER_DESC,
|
||||
};
|
||||
|
||||
// general-purpose kernel for addition, multiplication and division of two tensors
|
||||
// general-purpose kernel for addition, subtraction, multiplication and division of two tensors
|
||||
// pros: works for non-contiguous tensors, supports broadcast across all dims
|
||||
// cons: not very efficient
|
||||
kernel void kernel_add(
|
||||
@@ -70,6 +70,56 @@ kernel void kernel_add(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_sub(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int64_t & offs,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t i03 = tgpig.z;
|
||||
const int64_t i02 = tgpig.y;
|
||||
const int64_t i01 = tgpig.x;
|
||||
|
||||
const int64_t i13 = i03 % ne13;
|
||||
const int64_t i12 = i02 % ne12;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
|
||||
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
|
||||
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
|
||||
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + offs;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
const int i10 = i0 % ne10;
|
||||
*((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) - *((device float *)(src1_ptr + i10*nb10));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -226,6 +276,15 @@ kernel void kernel_add_row(
|
||||
dst[tpig] = src0[tpig] + src1[tpig % nb];
|
||||
}
|
||||
|
||||
kernel void kernel_sub_row(
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
constant uint64_t & nb [[buffer(28)]],
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] - src1[tpig % nb];
|
||||
}
|
||||
|
||||
kernel void kernel_mul_row(
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
@@ -358,6 +417,27 @@ kernel void kernel_sqr(
|
||||
dst[tpig] = src0[tpig] * src0[tpig];
|
||||
}
|
||||
|
||||
kernel void kernel_sqrt(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = sqrt(src0[tpig]);
|
||||
}
|
||||
|
||||
kernel void kernel_sin(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = sin(src0[tpig]);
|
||||
}
|
||||
|
||||
kernel void kernel_cos(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = cos(src0[tpig]);
|
||||
}
|
||||
|
||||
kernel void kernel_sum_rows(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
|
||||
@@ -3644,7 +3644,7 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q8_K_ref(x, y, k);
|
||||
}
|
||||
|
||||
//===================================== Dot ptoducts =================================
|
||||
//===================================== Dot products =================================
|
||||
|
||||
//
|
||||
// Helper functions
|
||||
|
||||
@@ -188,6 +188,8 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_upscale_f32;
|
||||
vk_pipeline pipeline_scale_f32;
|
||||
vk_pipeline pipeline_sqr_f32;
|
||||
vk_pipeline pipeline_sin_f32;
|
||||
vk_pipeline pipeline_cos_f32;
|
||||
vk_pipeline pipeline_clamp_f32;
|
||||
vk_pipeline pipeline_pad_f32;
|
||||
vk_pipeline pipeline_repeat_f32;
|
||||
@@ -1702,6 +1704,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
@@ -4023,6 +4027,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_sqr_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_SIN:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_sin_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_COS:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_cos_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CLAMP:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_clamp_f32;
|
||||
@@ -4171,6 +4185,8 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_REPEAT:
|
||||
@@ -4381,6 +4397,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_REPEAT:
|
||||
@@ -4598,6 +4616,32 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
0,
|
||||
0.0f, 0.0f,
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
0,
|
||||
0.0f, 0.0f,
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
float * op_params = (float *)dst->op_params;
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
@@ -5658,6 +5702,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_CPY:
|
||||
@@ -5735,6 +5781,14 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_SQR:
|
||||
ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_SIN:
|
||||
ggml_vk_sin(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_COS:
|
||||
ggml_vk_cos(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_CLAMP:
|
||||
ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun);
|
||||
@@ -5851,6 +5905,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_CPY:
|
||||
@@ -6582,6 +6638,8 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_CONT:
|
||||
@@ -7024,6 +7082,10 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
||||
tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
|
||||
} else if (tensor->op == GGML_OP_SQR) {
|
||||
tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
|
||||
} else if (tensor->op == GGML_OP_SIN) {
|
||||
tensor_clone = ggml_sin(ggml_ctx, src0_clone);
|
||||
} else if (tensor->op == GGML_OP_COS) {
|
||||
tensor_clone = ggml_cos(ggml_ctx, src0_clone);
|
||||
} else if (tensor->op == GGML_OP_CLAMP) {
|
||||
tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
|
||||
} else if (tensor->op == GGML_OP_PAD) {
|
||||
|
||||
1554
ggml/src/ggml.c
1554
ggml/src/ggml.c
File diff suppressed because it is too large
Load Diff
15
ggml/src/vulkan-shaders/cos.comp
Normal file
15
ggml/src/vulkan-shaders/cos.comp
Normal file
@@ -0,0 +1,15 @@
|
||||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
|
||||
}
|
||||
15
ggml/src/vulkan-shaders/sin.comp
Normal file
15
ggml/src/vulkan-shaders/sin.comp
Normal file
@@ -0,0 +1,15 @@
|
||||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
|
||||
}
|
||||
@@ -396,6 +396,14 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
@@ -304,8 +304,8 @@ extern "C" {
|
||||
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
|
||||
uint32_t n_ubatch; // physical maximum batch size
|
||||
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
int32_t n_threads; // number of threads to use for generation
|
||||
int32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
||||
@@ -428,6 +428,13 @@ extern "C" {
|
||||
//optional:
|
||||
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
|
||||
|
||||
// Optional: an auto threadpool gets created in ggml if not passed explicitly
|
||||
LLAMA_API void llama_attach_threadpool(
|
||||
struct llama_context * ctx,
|
||||
ggml_threadpool_t threadpool,
|
||||
ggml_threadpool_t threadpool_batch);
|
||||
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
|
||||
|
||||
// Call once at the end of the program - currently only used for MPI
|
||||
LLAMA_API void llama_backend_free(void);
|
||||
|
||||
@@ -837,13 +844,13 @@ extern "C" {
|
||||
// Set the number of threads used for decoding
|
||||
// n_threads is the number of threads used for generation (single token)
|
||||
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
||||
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
||||
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
|
||||
|
||||
// Get the number of threads used for generation of a single token.
|
||||
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
|
||||
LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
|
||||
|
||||
// Get the number of threads used for prompt and batch processing (multiple token).
|
||||
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
|
||||
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
|
||||
|
||||
// Set whether the model is in embeddings mode or not
|
||||
// If true, embeddings will be returned but logits will not
|
||||
|
||||
@@ -1 +1 @@
|
||||
797faa25af14126eb30134d4033139ae3c5428ed
|
||||
28b7633d733bbeef0026570fbc61c79c5e9aa5ae
|
||||
|
||||
@@ -2373,8 +2373,8 @@ struct llama_cparams {
|
||||
uint32_t n_batch;
|
||||
uint32_t n_ubatch;
|
||||
uint32_t n_seq_max;
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
int n_threads; // number of threads to use for generation
|
||||
int n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
float rope_freq_base;
|
||||
float rope_freq_scale;
|
||||
@@ -3091,6 +3091,9 @@ struct llama_context {
|
||||
#endif
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
|
||||
ggml_threadpool_t threadpool = nullptr;
|
||||
ggml_threadpool_t threadpool_batch = nullptr;
|
||||
|
||||
bool has_evaluated_once = false;
|
||||
|
||||
int64_t t_start_us;
|
||||
@@ -6605,6 +6608,7 @@ static bool llm_load_tensors(
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
const int64_t n_vocab = hparams.n_vocab;
|
||||
const int64_t n_vocab_type = hparams.n_vocab_type;
|
||||
const int64_t n_rot = hparams.n_rot;
|
||||
const int64_t n_expert = hparams.n_expert;
|
||||
const int64_t n_expert_used = hparams.n_expert_used;
|
||||
const int64_t n_ctx_train = hparams.n_ctx_train;
|
||||
@@ -6662,7 +6666,7 @@ static bool llm_load_tensors(
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
|
||||
layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
|
||||
if (n_expert == 0) {
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
@@ -8115,8 +8119,8 @@ static bool llm_load_tensors(
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + (hparams.n_embd_head_k << 2)});
|
||||
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + (hparams.n_embd_head_k << 2)});
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
||||
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
|
||||
@@ -8193,7 +8197,7 @@ static bool llm_load_tensors(
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
@@ -15493,9 +15497,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
|
||||
}
|
||||
|
||||
static void llama_graph_compute(
|
||||
llama_context & lctx,
|
||||
ggml_cgraph * gf,
|
||||
int n_threads) {
|
||||
llama_context & lctx,
|
||||
ggml_cgraph * gf,
|
||||
int n_threads,
|
||||
ggml_threadpool * threadpool) {
|
||||
#ifdef GGML_USE_METAL
|
||||
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
||||
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
||||
@@ -15504,6 +15509,7 @@ static void llama_graph_compute(
|
||||
|
||||
if (lctx.backend_cpu != nullptr) {
|
||||
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
||||
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
|
||||
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
||||
}
|
||||
#ifdef GGML_USE_BLAS
|
||||
@@ -15624,6 +15630,8 @@ static int llama_decode_internal(
|
||||
}
|
||||
|
||||
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
||||
ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
|
||||
|
||||
GGML_ASSERT(n_threads > 0);
|
||||
|
||||
// non-causal masks do not use the KV cache
|
||||
@@ -15685,7 +15693,7 @@ static int llama_decode_internal(
|
||||
|
||||
llama_set_inputs(lctx, ubatch);
|
||||
|
||||
llama_graph_compute(lctx, gf, n_threads);
|
||||
llama_graph_compute(lctx, gf, n_threads, threadpool);
|
||||
|
||||
// update the kv ring buffer
|
||||
{
|
||||
@@ -15862,7 +15870,9 @@ static int llama_encode_internal(
|
||||
lctx.inp_embd_enc = NULL;
|
||||
lctx.n_outputs = n_tokens;
|
||||
|
||||
const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
||||
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
||||
ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
|
||||
|
||||
GGML_ASSERT(n_threads > 0);
|
||||
|
||||
ggml_backend_sched_reset(lctx.sched);
|
||||
@@ -15894,7 +15904,7 @@ static int llama_encode_internal(
|
||||
|
||||
llama_set_inputs(lctx, ubatch);
|
||||
|
||||
llama_graph_compute(lctx, gf, n_threads);
|
||||
llama_graph_compute(lctx, gf, n_threads, threadpool);
|
||||
|
||||
// extract embeddings
|
||||
if (embd) {
|
||||
@@ -16176,7 +16186,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
||||
|
||||
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
|
||||
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
|
||||
#endif
|
||||
|
||||
//const int64_t t_end = ggml_time_us();
|
||||
@@ -16202,7 +16212,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
||||
|
||||
llama_set_k_shift(lctx);
|
||||
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
|
||||
|
||||
need_reserve = true;
|
||||
}
|
||||
@@ -16821,7 +16831,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||
if (name.find("attn_v.weight") != std::string::npos ||
|
||||
name.find("attn_qkv.weight") != std::string::npos) {
|
||||
name.find("attn_qkv.weight") != std::string::npos ||
|
||||
name.find("attn_kv_b.weight")!= std::string::npos) {
|
||||
++qs.n_attention_wv;
|
||||
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
||||
qs.has_output = true;
|
||||
@@ -17449,6 +17460,19 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
|
||||
}
|
||||
}
|
||||
|
||||
void llama_attach_threadpool(
|
||||
struct llama_context * ctx,
|
||||
ggml_threadpool_t threadpool,
|
||||
ggml_threadpool_t threadpool_batch) {
|
||||
ctx->threadpool = threadpool;
|
||||
ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
|
||||
}
|
||||
|
||||
void llama_detach_threadpool(struct llama_context * ctx) {
|
||||
ctx->threadpool = nullptr;
|
||||
ctx->threadpool_batch = nullptr;
|
||||
}
|
||||
|
||||
void llama_backend_free(void) {
|
||||
ggml_quantize_free();
|
||||
}
|
||||
@@ -19365,16 +19389,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
|
||||
}
|
||||
}
|
||||
|
||||
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
|
||||
void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
|
||||
ctx->cparams.n_threads = n_threads;
|
||||
ctx->cparams.n_threads_batch = n_threads_batch;
|
||||
}
|
||||
|
||||
uint32_t llama_n_threads(struct llama_context * ctx) {
|
||||
int32_t llama_n_threads(struct llama_context * ctx) {
|
||||
return ctx->cparams.n_threads;
|
||||
}
|
||||
|
||||
uint32_t llama_n_threads_batch(struct llama_context * ctx) {
|
||||
int32_t llama_n_threads_batch(struct llama_context * ctx) {
|
||||
return ctx->cparams.n_threads_batch;
|
||||
}
|
||||
|
||||
|
||||
@@ -1160,6 +1160,58 @@ struct test_sqrt : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_SIN
|
||||
struct test_sin : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
}
|
||||
|
||||
test_sin(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * out = ggml_sin(ctx, a);
|
||||
return out;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
init_tensor_uniform(t, -100.0f, 100.0f);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_COS
|
||||
struct test_cos : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
}
|
||||
|
||||
test_cos(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * out = ggml_cos(ctx, a);
|
||||
return out;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
init_tensor_uniform(t, -100.0f, 100.0f);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_CLAMP
|
||||
struct test_clamp : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -1731,6 +1783,27 @@ struct test_flash_attn_ext : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_CROSS_ENTROPY_LOSS
|
||||
struct test_cross_entropy_loss : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
}
|
||||
|
||||
test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
enum llm_norm_type {
|
||||
LLM_NORM,
|
||||
LLM_NORM_RMS,
|
||||
@@ -2393,6 +2466,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
|
||||
test_cases.emplace_back(new test_sqr());
|
||||
test_cases.emplace_back(new test_sqrt());
|
||||
test_cases.emplace_back(new test_sin());
|
||||
test_cases.emplace_back(new test_cos());
|
||||
test_cases.emplace_back(new test_clamp());
|
||||
|
||||
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
|
||||
@@ -2512,6 +2587,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_cross_entropy_loss());
|
||||
|
||||
// these tests are disabled to save execution time, but they can be handy for debugging
|
||||
#if 0
|
||||
test_cases.emplace_back(new test_llama(1));
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cassert>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@@ -217,7 +221,8 @@ static bool check_gradient(
|
||||
int nargs,
|
||||
float eps,
|
||||
float max_error_abs,
|
||||
float max_error_rel) {
|
||||
float max_error_rel,
|
||||
std::vector<double> expected_vals) {
|
||||
|
||||
static int n_threads = -1;
|
||||
if (n_threads < 0) {
|
||||
@@ -248,9 +253,10 @@ static bool check_gradient(
|
||||
// ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot");
|
||||
|
||||
for (int i = 0; i < nargs; ++i) {
|
||||
bool all_g0_bad = true;
|
||||
const int nelements = ggml_nelements(x[i]);
|
||||
for (int k = 0; k < nelements; ++k) {
|
||||
// compute gradient using finite differences
|
||||
// Calculate gradient numerically:
|
||||
const float x0 = ggml_get_f32_1d(x[i], k);
|
||||
const float xm = x0 - eps;
|
||||
const float xp = x0 + eps;
|
||||
@@ -267,6 +273,28 @@ static bool check_gradient(
|
||||
const double f1 = ggml_get_f32_1d(f, 0);
|
||||
const double g0 = (f0 - f1)/(2.0*(double) eps);
|
||||
|
||||
// The numerical calculation of the gradient fails around noncontinuities (e.g. 0 for ReLU).
|
||||
// In such cases, provide a vector of expected values and skip the comparison for failed calculations.
|
||||
if (!expected_vals.empty()) {
|
||||
bool matches_any = false;
|
||||
for (const double & ev : expected_vals) {
|
||||
const double error_abs = std::fabs(g0 - ev);
|
||||
if (error_abs > max_error_abs) {
|
||||
continue;
|
||||
}
|
||||
const double error_rel = g0 != 0.0 ? fabs(g0 - ev)/fabs(g0) : 0.0;
|
||||
if (error_rel > max_error_rel) {
|
||||
continue;
|
||||
}
|
||||
matches_any = true;
|
||||
break;
|
||||
}
|
||||
if (!matches_any) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
all_g0_bad = false;
|
||||
|
||||
ggml_set_f32_1d(x[i], k, x0);
|
||||
|
||||
// compute gradient using backward graph
|
||||
@@ -278,7 +306,7 @@ static bool check_gradient(
|
||||
const double g1 = ggml_get_f32_1d(x[i]->grad, k);
|
||||
|
||||
const double error_abs = fabs(g0 - g1);
|
||||
const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;
|
||||
const double error_rel = g0 != 0.0 ? fabs(g0 - g1)/fabs(g0) : 0.0;
|
||||
|
||||
if (error_abs > max_error_abs || error_rel > max_error_rel) {
|
||||
printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
|
||||
@@ -287,6 +315,10 @@ static bool check_gradient(
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (all_g0_bad) {
|
||||
printf("%s: numerical calculation of the gradient failed for all values\n", op_name);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -404,7 +436,7 @@ int main(int argc, const char ** argv) {
|
||||
seed_iter = rand();
|
||||
unsigned seed = rand();
|
||||
|
||||
printf("test-grad0: iter:%d/%d\n", iter, niter);
|
||||
printf("test-grad0: iter:%d/%d\n", (iter+1), niter);
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
get_random_dims(ne, 4);
|
||||
@@ -424,7 +456,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
|
||||
|
||||
check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
|
||||
check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -441,7 +473,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
|
||||
|
||||
check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
|
||||
check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -458,7 +490,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
|
||||
|
||||
check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
|
||||
check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -475,7 +507,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
|
||||
|
||||
check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -492,7 +524,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
|
||||
|
||||
check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
|
||||
check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -509,7 +541,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
|
||||
|
||||
check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -526,7 +558,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
|
||||
|
||||
check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
|
||||
check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -543,7 +575,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
|
||||
|
||||
check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
|
||||
check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -560,7 +592,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
|
||||
|
||||
check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
|
||||
check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -578,7 +610,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
|
||||
|
||||
check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
|
||||
check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -596,7 +628,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
|
||||
|
||||
check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
|
||||
check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -614,7 +646,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
|
||||
|
||||
check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
|
||||
check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -637,7 +669,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
|
||||
|
||||
check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
|
||||
check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -660,25 +692,25 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
|
||||
|
||||
check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
|
||||
check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
// abs (finite differences do not work)
|
||||
//{
|
||||
// const int nargs = 1;
|
||||
// abs
|
||||
{
|
||||
const int nargs = 1;
|
||||
|
||||
// for (int ndims = 1; ndims <= 2; ++ndims) {
|
||||
// for (int i = 0; i < nargs; ++i) {
|
||||
// x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
|
||||
// ggml_set_param(ctx0, x[i]);
|
||||
// }
|
||||
for (int ndims = 1; ndims <= 4; ++ndims) {
|
||||
for (int i = 0; i < nargs; ++i) {
|
||||
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
|
||||
ggml_set_param(ctx0, x[i]);
|
||||
}
|
||||
|
||||
// struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
|
||||
|
||||
// check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
|
||||
// }
|
||||
//}
|
||||
check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f, {-1.0, 1.0});
|
||||
}
|
||||
}
|
||||
|
||||
// sgn
|
||||
{
|
||||
@@ -693,7 +725,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
|
||||
|
||||
check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
|
||||
check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {0.0});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -710,7 +742,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
|
||||
|
||||
check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
|
||||
check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -727,7 +759,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
|
||||
|
||||
check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
|
||||
check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {0.0});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -745,7 +777,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
|
||||
|
||||
check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
|
||||
check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -776,7 +808,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
|
||||
|
||||
check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
if (ndims == 2) {
|
||||
// check_mat_mul does not support ndims > 2
|
||||
check_mat_mul(m, x[1], x[0]);
|
||||
@@ -800,7 +832,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
|
||||
|
||||
check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
|
||||
check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -817,7 +849,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
|
||||
|
||||
check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {0.0, 1.0});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -835,7 +867,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
|
||||
|
||||
check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
|
||||
check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -854,9 +886,9 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
#ifdef GGML_SILU_FP16
|
||||
// due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
|
||||
check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
|
||||
check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY, {});
|
||||
#else
|
||||
check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@@ -874,7 +906,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
|
||||
|
||||
check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
|
||||
check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -892,7 +924,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s));
|
||||
|
||||
check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -910,7 +942,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
|
||||
|
||||
check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -928,7 +960,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
|
||||
|
||||
check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
|
||||
check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -952,7 +984,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
|
||||
check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -976,7 +1008,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
|
||||
check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1004,7 +1036,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
|
||||
|
||||
check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1037,7 +1069,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
|
||||
|
||||
check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1072,7 +1104,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
|
||||
|
||||
check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1109,7 +1141,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
|
||||
|
||||
check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1137,7 +1169,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
|
||||
|
||||
check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1170,7 +1202,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
|
||||
|
||||
check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1194,7 +1226,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
|
||||
|
||||
check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1225,7 +1257,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
|
||||
|
||||
check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1257,7 +1289,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
|
||||
|
||||
check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1291,7 +1323,7 @@ int main(int argc, const char ** argv) {
|
||||
// sum requires contiguous tensor rows
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
|
||||
|
||||
check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1319,7 +1351,7 @@ int main(int argc, const char ** argv) {
|
||||
// sum requires contiguous tensor rows
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
|
||||
|
||||
check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1337,7 +1369,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
|
||||
|
||||
check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
|
||||
// diag_mask_inf
|
||||
@@ -1353,7 +1385,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
|
||||
|
||||
check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
|
||||
// diag_mask_zero
|
||||
@@ -1369,7 +1401,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
|
||||
|
||||
check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
|
||||
// softmax
|
||||
@@ -1395,7 +1427,7 @@ int main(int argc, const char ** argv) {
|
||||
1.0f - eps),
|
||||
ggml_new_f32(ctx0, eps))));
|
||||
|
||||
check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
|
||||
check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY, {});
|
||||
// NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
|
||||
// this may result in different gradients too finite differences.
|
||||
// when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
|
||||
@@ -1412,7 +1444,7 @@ int main(int argc, const char ** argv) {
|
||||
get_random_dims(ne2, 4);
|
||||
|
||||
for (int ndims = 1; ndims <= 4; ++ndims) {
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
|
||||
x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
|
||||
// the second argument to cross_entropy_loss must sum up to 1 for each row
|
||||
int nr = ggml_nrows(x[1]);
|
||||
@@ -1430,7 +1462,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
|
||||
|
||||
check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
|
||||
check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1468,7 +1500,7 @@ int main(int argc, const char ** argv) {
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));
|
||||
|
||||
GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
|
||||
check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
|
||||
check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1508,12 +1540,93 @@ int main(int argc, const char ** argv) {
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));
|
||||
|
||||
GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
|
||||
check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
|
||||
check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// im2col f32
|
||||
{
|
||||
srand(seed);
|
||||
const int nargs = 1;
|
||||
const int ndims = 4;
|
||||
|
||||
for (const bool is_2D : {false, true}) {
|
||||
int64_t ne0[ndims];
|
||||
int64_t ne1[ndims];
|
||||
get_random_dims(ne0, ndims);
|
||||
get_random_dims(ne1, ndims);
|
||||
|
||||
// // Ensure that the output is not zero-sized:
|
||||
ne1[0] += 8;
|
||||
ne1[1] += 8;
|
||||
|
||||
if (is_2D) {
|
||||
ne1[2] = ne0[2];
|
||||
} else {
|
||||
ne1[1] = ne0[1];
|
||||
ne0[3] = 1;
|
||||
ne1[3] = 1;
|
||||
}
|
||||
|
||||
// The order of arguments is swapped because the first tensor is only used for its shape.
|
||||
x[1] = get_random_tensor_f16(ctx0, ndims, ne0, -1.0f, 1.0f);
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne1, -1.0f, 1.0f);
|
||||
|
||||
ggml_set_param(ctx0, x[0]);
|
||||
|
||||
const int s0 = 1 + irand(2);
|
||||
const int s1 = is_2D ? 1 + irand(2) : 0;
|
||||
const int p0 = 0 + irand(2);
|
||||
const int p1 = is_2D ? 0 + irand(2) : 0;
|
||||
const int d0 = 1 + irand(2);
|
||||
const int d1 = is_2D ? 1 + irand(2) : 0;
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_im2col(ctx0, x[1], x[0], s0, s1, p0, p1, d0, d1, is_2D, GGML_TYPE_F32));
|
||||
|
||||
GGML_PRINT_DEBUG("im2col f32: is_2D=%s, s0=%d, s1=%d, p0=%d, p1=%d, d0=%d, d1=%d\n", is_2D ? "yes" : "no", s0, s1, p0, p1, d0, d1);
|
||||
check_gradient("im2col f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY, {});
|
||||
}
|
||||
}
|
||||
|
||||
// pool_2d f32
|
||||
{
|
||||
srand(seed);
|
||||
const int nargs = 1;
|
||||
const int ndims = 4;
|
||||
|
||||
for (const enum ggml_op_pool op : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
|
||||
int64_t ne0[ndims];
|
||||
get_random_dims(ne0, ndims);
|
||||
|
||||
ne0[0] += 8;
|
||||
ne0[1] += 8;
|
||||
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne0, -1.0f, 1.0f);
|
||||
|
||||
ggml_set_param(ctx0, x[0]);
|
||||
|
||||
const int k0 = 2 + irand(2);
|
||||
const int k1 = 2 + irand(2);
|
||||
const int s0 = 2 + irand(2);
|
||||
const int s1 = 2 + irand(2);
|
||||
const int p0 = 0 + irand(2);
|
||||
const int p1 = 0 + irand(2);
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_pool_2d(ctx0, x[0], op, k0, k1, s0, s1, p0, p1));
|
||||
|
||||
GGML_PRINT_DEBUG("ggml_pool_2d f32: op=%s k0=%d, k1=%d, s0=%d, s1=%d, p0=%d, p1=%d\n",
|
||||
op == GGML_OP_POOL_MAX ? "max" : "avg", k0, k1, s0, s1, p0, p1);
|
||||
std::vector<double> expected_vals;
|
||||
if (op == GGML_OP_POOL_MAX) {
|
||||
expected_vals.push_back(0.0);
|
||||
expected_vals.push_back(1.0);
|
||||
}
|
||||
check_gradient("ggml_pool_2d f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, expected_vals);
|
||||
}
|
||||
}
|
||||
|
||||
// flash_attn f32
|
||||
// TODO: adapt to ggml_flash_attn_ext() changes
|
||||
//{
|
||||
@@ -1553,7 +1666,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
// struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
||||
|
||||
// check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
||||
// check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY, {});
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
@@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32(
|
||||
}
|
||||
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
buf.resize(plan.work_size);
|
||||
|
||||
Reference in New Issue
Block a user