mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-06 00:54:06 +00:00
Compare commits
18 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
edd1ab7bc3 | ||
|
|
198ed7ebfc | ||
|
|
d836174731 | ||
|
|
06f2a5d190 | ||
|
|
c5239944ba | ||
|
|
1e9ae54cf2 | ||
|
|
7adedecbe3 | ||
|
|
356ea17e0f | ||
|
|
a5c088d8c6 | ||
|
|
1e3900ebac | ||
|
|
e39106c055 | ||
|
|
9fbda719de | ||
|
|
39d8bc71ed | ||
|
|
24a447e20a | ||
|
|
a20f3c7465 | ||
|
|
0235b9b571 | ||
|
|
ce18d727a4 | ||
|
|
91bb39cec7 |
@@ -8,12 +8,13 @@
|
||||
pkgsCuda,
|
||||
...
|
||||
}:
|
||||
lib.optionalAttrs (system == "aarch64-linux") {
|
||||
packages =
|
||||
{
|
||||
legacyPackages =
|
||||
let
|
||||
caps.jetson-xavier = "7.2";
|
||||
caps.jetson-orin = "8.7";
|
||||
caps.jetson-nano = "5.3";
|
||||
caps.llamaPackagesXavier = "7.2";
|
||||
caps.llamaPackagesOrin = "8.7";
|
||||
caps.llamaPackagesTX2 = "6.2";
|
||||
caps.llamaPackagesNano = "5.3";
|
||||
|
||||
pkgsFor =
|
||||
cap:
|
||||
@@ -27,6 +28,12 @@
|
||||
};
|
||||
};
|
||||
in
|
||||
builtins.mapAttrs (name: cap: ((pkgsFor cap).callPackage ./scope.nix { }).llama-cpp) caps;
|
||||
builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
|
||||
|
||||
packages = lib.optionalAttrs (system == "aarch64-linux") {
|
||||
jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
|
||||
jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
|
||||
jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
1
.github/workflows/build.yml
vendored
1
.github/workflows/build.yml
vendored
@@ -515,7 +515,6 @@ jobs:
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
# steps:
|
||||
|
||||
112
.github/workflows/nix-ci.yml
vendored
Normal file
112
.github/workflows/nix-ci.yml
vendored
Normal file
@@ -0,0 +1,112 @@
|
||||
name: Nix CI
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
|
||||
jobs:
|
||||
nix-eval:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest ]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: List all flake outputs
|
||||
run: nix flake show --all-systems
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
nix-build:
|
||||
if: ${{ vars.CACHIX_NAME != '' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest ]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: ${{ vars.CACHIX_NAME }}
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--flake
|
||||
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
nix-build-aarch64:
|
||||
if: ${{ vars.CACHIX_NAME != '' }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install QEMU
|
||||
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
|
||||
run: |
|
||||
sudo apt-get install -y qemu-user-static qemu-system-aarch64
|
||||
sudo usermod -a -G kvm $USER
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-platforms = aarch64-linux
|
||||
extra-system-features = nixos-test kvm
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: ${{ vars.CACHIX_NAME }}
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.aarch64-linux"
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--systems aarch64-linux
|
||||
--flake
|
||||
".#checks.aarch64-linux"
|
||||
22
.github/workflows/nix-flake-update.yml
vendored
Normal file
22
.github/workflows/nix-flake-update.yml
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
name: update-flake-lock
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
|
||||
|
||||
jobs:
|
||||
lockfile:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@main
|
||||
- name: Update flake.lock
|
||||
uses: DeterminateSystems/update-flake-lock@main
|
||||
with:
|
||||
pr-title: "nix: update flake.lock"
|
||||
pr-labels: |
|
||||
nix
|
||||
pr-reviewers: philiptaron,SomeoneSerge
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
23
.github/workflows/nix-flakestry.yml
vendored
23
.github/workflows/nix-flakestry.yml
vendored
@@ -1,23 +0,0 @@
|
||||
# Make the flake discoverable on https://flakestry.dev
|
||||
name: "Publish a flake to flakestry"
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "v?[0-9]+.[0-9]+.[0-9]+"
|
||||
- "v?[0-9]+.[0-9]+"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "The existing tag to publish"
|
||||
type: "string"
|
||||
required: true
|
||||
jobs:
|
||||
publish-flake:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: flakestry/flakestry-publish@main
|
||||
with:
|
||||
version: "${{ inputs.tag || github.ref_name }}"
|
||||
36
.github/workflows/nix-publish-flake.yml
vendored
Normal file
36
.github/workflows/nix-publish-flake.yml
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
|
||||
name: "Publish a flake to flakestry & flakehub"
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "*"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "The existing tag to publish"
|
||||
type: "string"
|
||||
required: true
|
||||
jobs:
|
||||
flakestry-publish:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: flakestry/flakestry-publish@main
|
||||
with:
|
||||
version: "${{ inputs.tag || github.ref_name }}"
|
||||
flakehub-publish:
|
||||
runs-on: "ubuntu-latest"
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: "actions/checkout@v4"
|
||||
with:
|
||||
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
|
||||
- uses: "DeterminateSystems/nix-installer-action@main"
|
||||
- uses: "DeterminateSystems/flakehub-push@main"
|
||||
with:
|
||||
visibility: "public"
|
||||
tag: "${{ inputs.tag }}"
|
||||
30
README.md
30
README.md
@@ -385,16 +385,30 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
Check [BLIS.md](docs/BLIS.md) for more information.
|
||||
|
||||
- #### Intel MKL
|
||||
- #### Intel oneMKL
|
||||
- Using manual oneAPI installation:
|
||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-runtime docker image, only required for manual installation
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
|
||||
- Using oneAPI docker image:
|
||||
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake --build . --config Release
|
||||
```
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
|
||||
|
||||
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
|
||||
|
||||
- #### cuBLAS
|
||||
|
||||
|
||||
@@ -1394,6 +1394,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
|
||||
@@ -24,7 +24,8 @@ endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(llava BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -16,12 +16,19 @@
|
||||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
#define CLIP_DEBUG
|
||||
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
@@ -139,6 +146,27 @@ static std::string get_ftype(int ftype) {
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// image data
|
||||
//
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<uint8_t> buf;
|
||||
};
|
||||
|
||||
// RGB float32 image (NHWC)
|
||||
// Memory layout: RGBRGBRGB...
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<float> buf;
|
||||
};
|
||||
|
||||
//
|
||||
// clip layers
|
||||
//
|
||||
@@ -196,39 +224,31 @@ struct clip_vision_model {
|
||||
struct ggml_tensor * mm_2_b;
|
||||
};
|
||||
|
||||
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
|
||||
struct clip_buffer {
|
||||
uint8_t * data = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
void resize(size_t size) {
|
||||
delete[] data;
|
||||
data = new uint8_t[size];
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
~clip_buffer() { delete[] data; }
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
bool has_text_encoder = false;
|
||||
bool has_vision_encoder = false;
|
||||
bool has_text_encoder = false;
|
||||
bool has_vision_encoder = false;
|
||||
bool has_llava_projector = false;
|
||||
|
||||
struct clip_vision_model vision_model;
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
bool use_gelu = false;
|
||||
int32_t ftype = 1;
|
||||
struct ggml_context * ctx;
|
||||
|
||||
struct gguf_context * ctx_gguf;
|
||||
struct ggml_context * ctx_data;
|
||||
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
// memory buffers to evaluate the model
|
||||
clip_buffer buf_compute;
|
||||
clip_buffer buf_alloc;
|
||||
ggml_allocr * alloc = NULL;
|
||||
ggml_backend_buffer_t params_buffer = NULL;
|
||||
ggml_backend_buffer_t compute_buffer = NULL;
|
||||
ggml_backend_t backend = NULL;
|
||||
ggml_allocr * compute_alloc = NULL;
|
||||
};
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return nullptr;
|
||||
@@ -249,28 +269,24 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
//const int projection_dim = hparams.projection_dim;
|
||||
const float eps = hparams.eps;
|
||||
int batch_size = imgs->size;
|
||||
if(ctx->has_llava_projector) {
|
||||
if (ctx->has_llava_projector) {
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
}
|
||||
|
||||
const auto & buf_compute = ctx->buf_compute;
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
||||
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
params.no_alloc = true;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
|
||||
ggml_allocr_alloc(ctx->alloc, inp_raw);
|
||||
ggml_allocr_alloc(ctx->compute_alloc, inp_raw);
|
||||
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
float * data = (float *)ggml_get_data(inp_raw);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
float * data = (float *)malloc(ggml_nbytes(inp_raw));
|
||||
|
||||
for (size_t i = 0; i < imgs->size; i++) {
|
||||
const int nx = imgs->data[i].nx;
|
||||
@@ -283,12 +299,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
for (int k = 0; k < 3; k++) {
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k];
|
||||
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
||||
free(data);
|
||||
}
|
||||
|
||||
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
@@ -298,36 +316,39 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
|
||||
// concat class_embeddings and patch_embeddings
|
||||
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||
ggml_allocr_alloc(ctx->alloc, embeddings);
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
ggml_set_zero(embeddings);
|
||||
ggml_allocr_alloc(ctx->compute_alloc, embeddings);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
void* zero_mem = malloc(ggml_nbytes(embeddings));
|
||||
memset(zero_mem, 0, ggml_nbytes(embeddings));
|
||||
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
|
||||
free(zero_mem);
|
||||
}
|
||||
|
||||
struct ggml_tensor * temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size);
|
||||
ggml_allocr_alloc(ctx->alloc, temp);
|
||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||
|
||||
embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1],
|
||||
embeddings->nb[2], embeddings->nb[3], 0);
|
||||
embeddings =
|
||||
ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
|
||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||
ggml_allocr_alloc(ctx->alloc, positions);
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
ggml_allocr_alloc(ctx->compute_alloc, positions);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
ggml_set_i32_1d(positions, i, i);
|
||||
positions_data[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
}
|
||||
|
||||
embeddings =
|
||||
ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings));
|
||||
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
||||
|
||||
// pre-layernorm
|
||||
{
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.pre_ln_w, embeddings), embeddings),
|
||||
ggml_repeat(ctx0, model.pre_ln_b, embeddings));
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
|
||||
}
|
||||
|
||||
// loop over layers
|
||||
@@ -340,15 +361,15 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
{
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
|
||||
model.layers[il].ln_1_b);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
{
|
||||
|
||||
struct ggml_tensor * Q =
|
||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur));
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
|
||||
|
||||
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
|
||||
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
|
||||
@@ -356,14 +377,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
|
||||
|
||||
struct ggml_tensor * K =
|
||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), ggml_mul_mat(ctx0, model.layers[il].k_w, cur));
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
|
||||
|
||||
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
|
||||
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
|
||||
|
||||
struct ggml_tensor * V =
|
||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), ggml_mul_mat(ctx0, model.layers[il].v_w, cur));
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
|
||||
|
||||
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
|
||||
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||
@@ -379,7 +400,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
}
|
||||
|
||||
// attention output
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), ggml_mul_mat(ctx0, model.layers[il].o_w, cur));
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
|
||||
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, embeddings);
|
||||
@@ -390,12 +411,11 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
{
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
|
||||
}
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
|
||||
|
||||
if (ctx->use_gelu) {
|
||||
cur = ggml_gelu_inplace(ctx0, cur);
|
||||
@@ -404,7 +424,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
}
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, embeddings, cur);
|
||||
@@ -417,23 +437,26 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||
|
||||
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
||||
ggml_allocr_alloc(ctx->alloc, patches);
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
for (int i = 0; i < num_patches; ++i) {
|
||||
ggml_set_i32_1d(patches, i, i+1);
|
||||
ggml_allocr_alloc(ctx->compute_alloc, patches);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
patches_data[i] = i + 1;
|
||||
}
|
||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||
free(patches_data);
|
||||
}
|
||||
|
||||
embeddings = ggml_get_rows(ctx0, embeddings, patches);
|
||||
|
||||
// mm projection 0
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_0_b, embeddings), embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_2_b, embeddings), embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
@@ -446,7 +469,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
|
||||
struct ggml_context * meta = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
@@ -479,7 +501,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
printf("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
// kv
|
||||
if (verbosity >= 3) {
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
@@ -493,27 +515,38 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
|
||||
// data
|
||||
size_t ctx_size = 0;
|
||||
size_t buffer_size = 0;
|
||||
{
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(meta, name);
|
||||
ctx_size += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
|
||||
size_t tensor_size = ggml_nbytes(cur);
|
||||
size_t padded_size = ggml_nbytes_pad(cur);
|
||||
ctx_size += padded_size;
|
||||
buffer_size += tensor_size;
|
||||
if (verbosity >= 3) {
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
|
||||
ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
|
||||
ggml_n_dims(cur), cur->name, tensor_size, offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buffer_size += n_tensors * 128 /* CLIP PADDING */;
|
||||
|
||||
clip_ctx * new_clip = new clip_ctx;
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
new_clip->backend = ggml_backend_cuda_init(0);
|
||||
printf("%s: CLIP using CUDA backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
new_clip->backend = ggml_backend_metal_init();
|
||||
printf("%s: CLIP using Metal backend\n", __func__);
|
||||
#endif
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
printf("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
// model size and capabilities
|
||||
{
|
||||
@@ -539,21 +572,24 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||
printf("%s: model size: %.2f MB\n", __func__, (ctx_size / 1024.0 / 1024.0));
|
||||
printf("%s: model size: %.2f MB\n", __func__, buffer_size / 1024.0 / 1024.0);
|
||||
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, buffer_size / (1024.0 * 1024.0), n_tensors);
|
||||
|
||||
// load tensors
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
new_clip->ctx = ggml_init(params);
|
||||
if (!new_clip->ctx) {
|
||||
new_clip->ctx_data = ggml_init(params);
|
||||
if (!new_clip->ctx_data) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
clip_free(new_clip);
|
||||
return nullptr;
|
||||
@@ -566,13 +602,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
// add tensors to context
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * t = ggml_get_tensor(meta, name);
|
||||
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t);
|
||||
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
|
||||
ggml_set_name(cur, name);
|
||||
}
|
||||
|
||||
// alloc memory and offload data
|
||||
new_clip->params_buffer = ggml_backend_alloc_buffer(new_clip->backend, buffer_size);
|
||||
ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
||||
ggml_allocr_alloc(alloc, cur);
|
||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
if (!fin) {
|
||||
@@ -580,10 +624,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
clip_free(new_clip);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(cur->data), ggml_nbytes(t));
|
||||
int num_bytes = ggml_nbytes(cur);
|
||||
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
||||
} else {
|
||||
// read into a temporary buffer first, then copy to device memory
|
||||
read_buf.resize(num_bytes);
|
||||
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_allocr_free(alloc);
|
||||
fin.close();
|
||||
}
|
||||
|
||||
@@ -592,20 +644,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
// load vision model
|
||||
auto & vision_model = new_clip->vision_model;
|
||||
auto & hparams = vision_model.hparams;
|
||||
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
|
||||
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
|
||||
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
|
||||
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
|
||||
hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
|
||||
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
|
||||
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
|
||||
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
|
||||
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
|
||||
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
|
||||
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
|
||||
hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
|
||||
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
|
||||
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
|
||||
|
||||
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
|
||||
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
|
||||
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
|
||||
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
|
||||
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
|
||||
}
|
||||
|
||||
if (verbosity >= 2) {
|
||||
@@ -619,35 +671,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
printf("v_n_layer %d\n", hparams.n_layer);
|
||||
}
|
||||
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD);
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
|
||||
vision_model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
auto & layer = vision_model.layers[il];
|
||||
layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight"));
|
||||
layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight"));
|
||||
layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight"));
|
||||
layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight"));
|
||||
layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight"));
|
||||
layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight"));
|
||||
layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight"));
|
||||
layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight"));
|
||||
layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias"));
|
||||
layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias"));
|
||||
layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias"));
|
||||
layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias"));
|
||||
layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias"));
|
||||
layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias"));
|
||||
layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias"));
|
||||
layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias"));
|
||||
layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
|
||||
layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
|
||||
layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
|
||||
layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
|
||||
layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
|
||||
layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
|
||||
layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
|
||||
layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
|
||||
layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
|
||||
layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
|
||||
layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
|
||||
layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
|
||||
layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
|
||||
layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
|
||||
layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
|
||||
layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -655,45 +707,45 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
|
||||
new_clip->ctx_gguf = ctx;
|
||||
|
||||
// measure mem requirement and allocate
|
||||
// measure mem requirement and allocate
|
||||
{
|
||||
static const size_t tensor_alignment = 32;
|
||||
new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
|
||||
new_clip->alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
|
||||
clip_image_f32_batch batch;
|
||||
batch.size = 1;
|
||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
|
||||
size_t alloc_size = ggml_allocr_alloc_graph(new_clip->alloc, gf) + tensor_alignment;
|
||||
ggml_allocr_free(new_clip->alloc);
|
||||
new_clip->buf_alloc.resize(alloc_size);
|
||||
new_clip->alloc = ggml_allocr_new(new_clip->buf_alloc.data, new_clip->buf_alloc.size, tensor_alignment);
|
||||
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_clip->compute_alloc, gf);
|
||||
ggml_allocr_free(new_clip->compute_alloc);
|
||||
new_clip->compute_buffer = ggml_backend_alloc_buffer(new_clip->backend, compute_memory_buffer_size);
|
||||
new_clip->compute_alloc = ggml_allocr_new_from_buffer(new_clip->compute_buffer);
|
||||
|
||||
printf("%s: total allocated memory: %.2f MB\n", __func__, (new_clip->buf_compute.size + alloc_size)/1024.0/1024.0);
|
||||
printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
}
|
||||
|
||||
return new_clip;
|
||||
}
|
||||
|
||||
clip_image_u8 * make_clip_image_u8() {
|
||||
auto img = new clip_image_u8();
|
||||
return img;
|
||||
struct clip_image_u8 * clip_image_u8_init() {
|
||||
return new clip_image_u8();
|
||||
}
|
||||
clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
|
||||
|
||||
void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
|
||||
void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
|
||||
struct clip_image_f32 * clip_image_f32_init() {
|
||||
return new clip_image_f32();
|
||||
}
|
||||
|
||||
void clip_image_u8_free (struct clip_image_u8 * img) { delete img; }
|
||||
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
|
||||
|
||||
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
|
||||
img->nx = nx;
|
||||
img->ny = ny;
|
||||
img->size = nx * ny * 3;
|
||||
img->data = new uint8_t[img->size]();
|
||||
memcpy(img->data, data, img->size);
|
||||
img->buf.resize(3 * nx * ny);
|
||||
memcpy(img->buf.data(), data, img->buf.size());
|
||||
}
|
||||
|
||||
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||
int nx, ny, nc;
|
||||
auto data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
|
||||
return false;
|
||||
@@ -705,7 +757,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||
|
||||
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
|
||||
int nx, ny, nc;
|
||||
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
|
||||
return false;
|
||||
@@ -717,7 +769,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
||||
|
||||
// normalize: x = (x - mean) / std
|
||||
// TODO: implement bicubic interpolation instead of linear.
|
||||
bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
@@ -726,18 +778,17 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
||||
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
||||
|
||||
clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
|
||||
clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
|
||||
if (pad2square && img->nx != img->ny) {
|
||||
int longer_side = std::max(img->nx, img->ny);
|
||||
temp->nx = longer_side;
|
||||
temp->ny = longer_side;
|
||||
temp->size = 3 * longer_side * longer_side;
|
||||
temp->data = new uint8_t[temp->size]();
|
||||
uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
||||
temp->buf.resize(3 * longer_side * longer_side);
|
||||
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
||||
|
||||
// fill with background color
|
||||
for (size_t i = 0; i < temp->size; i++) {
|
||||
temp->data[i] = bc[i % 3];
|
||||
for (size_t i = 0; i < temp->buf.size(); i++) {
|
||||
temp->buf[i] = bc[i % 3];
|
||||
}
|
||||
|
||||
// copy from the input image
|
||||
@@ -745,17 +796,16 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
for (int x = 0; x < img->nx; x++) {
|
||||
const int i = 3 * (y * img->nx + x);
|
||||
const int j = 3 * (y * temp->nx + x);
|
||||
temp->data[j] = img->data[i];
|
||||
temp->data[j+1] = img->data[i+1];
|
||||
temp->data[j+2] = img->data[i+2];
|
||||
temp->buf[j] = img->buf[i];
|
||||
temp->buf[j+1] = img->buf[i+1];
|
||||
temp->buf[j+2] = img->buf[i+2];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
temp->nx = img->nx;
|
||||
temp->ny = img->ny;
|
||||
temp->size = img->size;
|
||||
temp->data = new uint8_t[temp->size]();
|
||||
memcpy(&temp->data[0], &img->data[0], temp->size); // copy
|
||||
temp->nx = img->nx;
|
||||
temp->ny = img->ny;
|
||||
temp->buf.resize(img->buf.size());
|
||||
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
|
||||
}
|
||||
|
||||
const int nx = temp->nx;
|
||||
@@ -766,8 +816,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
|
||||
res->nx = nx2;
|
||||
res->ny = ny2;
|
||||
res->size = 3 * nx2 * ny2;
|
||||
res->data = new float[res->size]();
|
||||
res->buf.resize(3 * nx2 * ny2);
|
||||
|
||||
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
|
||||
|
||||
@@ -798,10 +847,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
const int j10 = 3 * (y1 * nx + x0) + c;
|
||||
const int j11 = 3 * (y1 * nx + x1) + c;
|
||||
|
||||
const float v00 = temp->data[j00];
|
||||
const float v01 = temp->data[j01];
|
||||
const float v10 = temp->data[j10];
|
||||
const float v11 = temp->data[j11];
|
||||
const float v00 = temp->buf[j00];
|
||||
const float v01 = temp->buf[j01];
|
||||
const float v10 = temp->buf[j10];
|
||||
const float v11 = temp->buf[j11];
|
||||
|
||||
const float v0 = v00 * (1.0f - dx) + v01 * dx;
|
||||
const float v1 = v10 * (1.0f - dx) + v11 * dx;
|
||||
@@ -812,7 +861,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
|
||||
const int i = 3 * (y * nx3 + x) + c;
|
||||
|
||||
res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
|
||||
res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -822,12 +871,13 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
ggml_free(ctx->ctx);
|
||||
ggml_free(ctx->ctx_data);
|
||||
gguf_free(ctx->ctx_gguf);
|
||||
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
@@ -839,8 +889,7 @@ bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32
|
||||
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
|
||||
}
|
||||
|
||||
bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||
|
||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
@@ -852,29 +901,29 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl
|
||||
}
|
||||
|
||||
// reset alloc buffer to clean the memory from previous invocations
|
||||
ggml_allocr_reset(ctx->alloc);
|
||||
ggml_allocr_reset(ctx->compute_alloc);
|
||||
|
||||
// build the inference graph
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
||||
ggml_allocr_alloc_graph(ctx->alloc, gf);
|
||||
ggml_allocr_alloc_graph(ctx->compute_alloc, gf);
|
||||
|
||||
struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = (uint8_t *)malloc(plan.work_size);
|
||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||
}
|
||||
|
||||
ggml_graph_compute(gf, &plan);
|
||||
#ifdef GGML_USE_METAL
|
||||
if (ggml_backend_is_metal(ctx->backend)) {
|
||||
ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
|
||||
}
|
||||
#endif
|
||||
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
|
||||
// the last node is the embedding tensor
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||
|
||||
// copy the embeddings to the location passed by the user
|
||||
memcpy(vec, ggml_get_data_f32(embeddings), ggml_nbytes(embeddings));
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -883,31 +932,32 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
ggml_type type = GGML_TYPE_Q4_1;
|
||||
|
||||
switch (itype) {
|
||||
case 2:
|
||||
type = GGML_TYPE_Q4_0;
|
||||
break;
|
||||
case 3:
|
||||
type = GGML_TYPE_Q4_1;
|
||||
break;
|
||||
case 6:
|
||||
type = GGML_TYPE_Q5_0;
|
||||
break;
|
||||
case 7:
|
||||
type = GGML_TYPE_Q5_1;
|
||||
break;
|
||||
case 8:
|
||||
type = GGML_TYPE_Q8_0;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
|
||||
return false;
|
||||
case 2:
|
||||
type = GGML_TYPE_Q4_0;
|
||||
break;
|
||||
case 3:
|
||||
type = GGML_TYPE_Q4_1;
|
||||
break;
|
||||
case 6:
|
||||
type = GGML_TYPE_Q5_0;
|
||||
break;
|
||||
case 7:
|
||||
type = GGML_TYPE_Q5_1;
|
||||
break;
|
||||
case 8:
|
||||
type = GGML_TYPE_Q8_0;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
|
||||
return false;
|
||||
};
|
||||
|
||||
auto ctx_clip = clip_model_load(fname_inp, 2);
|
||||
const auto & ctx_src = ctx_clip->ctx_gguf;
|
||||
const auto & ctx_data = ctx_clip->ctx;
|
||||
auto * ctx_clip = clip_model_load(fname_inp, 2);
|
||||
|
||||
auto ctx_out = gguf_init_empty();
|
||||
const auto & ctx_src = ctx_clip->ctx_gguf;
|
||||
const auto & ctx_data = ctx_clip->ctx_data;
|
||||
|
||||
auto * ctx_out = gguf_init_empty();
|
||||
gguf_set_kv(ctx_out, ctx_src);
|
||||
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", itype);
|
||||
@@ -1045,8 +1095,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
gguf_free(ctx_out);
|
||||
|
||||
{
|
||||
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||
|
||||
int64_t sum_all = 0;
|
||||
for (size_t i = 0; i < hist_all.size(); ++i) {
|
||||
|
||||
@@ -35,31 +35,14 @@ struct clip_vision_hparams {
|
||||
float eps;
|
||||
};
|
||||
|
||||
/** load mmproj model */
|
||||
CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
|
||||
/** free mmproj model */
|
||||
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
|
||||
|
||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
int clip_n_patches(const struct clip_ctx * ctx);
|
||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
uint8_t * data = NULL;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// RGB float32 image (NHWC)
|
||||
// Memory layout: RGBRGBRGB...
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
float * data = NULL;
|
||||
size_t size;
|
||||
};
|
||||
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
||||
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
|
||||
struct clip_image_u8_batch {
|
||||
struct clip_image_u8 * data;
|
||||
@@ -71,21 +54,22 @@ struct clip_image_f32_batch {
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct clip_image_u8 * make_clip_image_u8();
|
||||
struct clip_image_f32 * make_clip_image_f32();
|
||||
CLIP_API void clip_image_u8_free(clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(clip_image_f32 * img);
|
||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||
|
||||
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
||||
|
||||
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||
|
||||
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
||||
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
||||
|
||||
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
|
||||
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
|
||||
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
||||
|
||||
bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
|
||||
float * vec);
|
||||
|
||||
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
|
||||
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "base64.hpp"
|
||||
|
||||
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
|
||||
clip_image_f32 * img_res = make_clip_image_f32();
|
||||
clip_image_f32 * img_res = clip_image_f32_init();
|
||||
if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
|
||||
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
|
||||
clip_image_f32_free(img_res);
|
||||
@@ -86,7 +86,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
}
|
||||
|
||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
|
||||
clip_image_u8 * img = make_clip_image_u8();
|
||||
clip_image_u8 * img = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
||||
clip_image_u8_free(img);
|
||||
fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
|
||||
|
||||
@@ -82,7 +82,7 @@ static inline bool is_base64(uint8_t c)
|
||||
return (isalnum(c) || (c == '+') || (c == '/'));
|
||||
}
|
||||
|
||||
static std::vector<uint8_t> base64_decode(std::string const &encoded_string)
|
||||
static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
||||
{
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
@@ -209,10 +209,10 @@ struct slot_image
|
||||
int32_t id;
|
||||
|
||||
bool request_encode_image = false;
|
||||
float* image_embedding = nullptr;
|
||||
float * image_embedding = nullptr;
|
||||
int32_t image_tokens = 0;
|
||||
|
||||
clip_image_u8 img_data;
|
||||
clip_image_u8 * img_data;
|
||||
|
||||
std::string prefix_prompt; // before of this image
|
||||
};
|
||||
@@ -434,10 +434,12 @@ struct llama_client_slot
|
||||
|
||||
generated_token_probs.clear();
|
||||
|
||||
for (slot_image &img : images)
|
||||
for (slot_image & img : images)
|
||||
{
|
||||
free(img.image_embedding);
|
||||
delete[] img.img_data.data;
|
||||
if (img.img_data) {
|
||||
clip_image_u8_free(img.img_data);
|
||||
}
|
||||
img.prefix_prompt = "";
|
||||
}
|
||||
|
||||
@@ -851,24 +853,17 @@ struct llama_server_context
|
||||
{
|
||||
for (const auto &img : *images_data)
|
||||
{
|
||||
std::string data_b64 = img["data"].get<std::string>();
|
||||
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
|
||||
|
||||
slot_image img_sl;
|
||||
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
|
||||
int width, height, channels;
|
||||
std::vector<uint8_t> image_buffer = base64_decode(data_b64);
|
||||
data_b64.clear();
|
||||
auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
|
||||
if (!data) {
|
||||
img_sl.img_data = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||
{
|
||||
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
|
||||
return false;
|
||||
}
|
||||
LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
|
||||
img_sl.img_data.nx = width;
|
||||
img_sl.img_data.ny = height;
|
||||
img_sl.img_data.size = width * height * 3;
|
||||
img_sl.img_data.data = new uint8_t[width * height * 3]();
|
||||
memcpy(img_sl.img_data.data, data, width * height * 3);
|
||||
stbi_image_free(data);
|
||||
LOG_TEE("slot %i - loaded image\n", slot->id);
|
||||
img_sl.request_encode_image = true;
|
||||
slot->images.push_back(img_sl);
|
||||
}
|
||||
@@ -1143,8 +1138,8 @@ struct llama_server_context
|
||||
{
|
||||
continue;
|
||||
}
|
||||
clip_image_f32 img_res;
|
||||
if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
|
||||
clip_image_f32 * img_res = clip_image_f32_init();
|
||||
if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
|
||||
{
|
||||
LOG_TEE("Error processing the given image");
|
||||
clip_free(clp_ctx);
|
||||
@@ -1159,11 +1154,12 @@ struct llama_server_context
|
||||
return false;
|
||||
}
|
||||
LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
|
||||
if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
|
||||
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
|
||||
{
|
||||
LOG_TEE("Unable to encode image\n");
|
||||
return false;
|
||||
}
|
||||
clip_image_f32_free(img_res);
|
||||
img.request_encode_image = false;
|
||||
}
|
||||
|
||||
|
||||
6
flake.lock
generated
6
flake.lock
generated
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1703559957,
|
||||
"narHash": "sha256-x9PUuMEPGUOMB51zNxrDr2QoHbYWlCS2xhFedm9MC5Q=",
|
||||
"lastModified": 1703637592,
|
||||
"narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "75dd68c36f458c6593c5bbb48abfd3e59bfed380",
|
||||
"rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
51
flake.nix
51
flake.nix
@@ -6,6 +6,29 @@
|
||||
flake-parts.url = "github:hercules-ci/flake-parts";
|
||||
};
|
||||
|
||||
# Optional binary cache
|
||||
nixConfig = {
|
||||
extra-substituters = [
|
||||
# Populated by the CI in ggerganov/llama.cpp
|
||||
"https://llama-cpp.cachix.org"
|
||||
|
||||
# A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
||||
# Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
||||
# This lets one skip building e.g. the CUDA-enabled openmpi.
|
||||
# TODO: Replace once nix-community obtains an official one.
|
||||
"https://cuda-maintainers.cachix.org"
|
||||
];
|
||||
|
||||
# Verify these are the same keys as published on
|
||||
# - https://app.cachix.org/cache/llama-cpp
|
||||
# - https://app.cachix.org/cache/cuda-maintainers
|
||||
extra-trusted-public-keys = [
|
||||
"llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
|
||||
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
||||
];
|
||||
};
|
||||
|
||||
|
||||
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
|
||||
#
|
||||
# ```bash
|
||||
@@ -74,26 +97,48 @@
|
||||
{
|
||||
config,
|
||||
lib,
|
||||
system,
|
||||
pkgs,
|
||||
pkgsCuda,
|
||||
pkgsRocm,
|
||||
...
|
||||
}:
|
||||
{
|
||||
# Unlike `.#packages`, legacyPackages may contain values of
|
||||
# arbitrary types (including nested attrsets) and may even throw
|
||||
# exceptions. This attribute isn't recursed into by `nix flake
|
||||
# show` either.
|
||||
#
|
||||
# You can add arbitrary scripts to `.devops/nix/scope.nix` and
|
||||
# access them as `nix build .#llamaPackages.${scriptName}` using
|
||||
# the same path you would with an overlay.
|
||||
legacyPackages = {
|
||||
llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||
llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||
llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||
};
|
||||
|
||||
# We don't use the overlay here so as to avoid making too many instances of nixpkgs,
|
||||
# cf. https://zimbatm.com/notes/1000-instances-of-nixpkgs
|
||||
packages =
|
||||
{
|
||||
default = (pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
|
||||
default = config.legacyPackages.llamaPackages.llama-cpp;
|
||||
}
|
||||
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
||||
opencl = config.packages.default.override { useOpenCL = true; };
|
||||
cuda = (pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
|
||||
rocm = (pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
|
||||
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
|
||||
|
||||
mpi-cpu = config.packages.default.override { useMpi = true; };
|
||||
mpi-cuda = config.packages.default.override { useMpi = true; };
|
||||
}
|
||||
// lib.optionalAttrs (system == "x86_64-linux") {
|
||||
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
|
||||
};
|
||||
|
||||
# Packages exposed in `.#checks` will be built by the CI and by
|
||||
# `nix flake check`. Currently we expose all packages, but we could
|
||||
# make more granular choices
|
||||
checks = config.packages;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
39
ggml-cuda.cu
39
ggml-cuda.cu
@@ -119,7 +119,9 @@
|
||||
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
||||
#define CC_VOLTA 700
|
||||
#define CC_OFFSET_AMD 1000000
|
||||
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
|
||||
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
||||
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
|
||||
|
||||
#define GGML_CUDA_MAX_NODES 8192
|
||||
|
||||
@@ -133,7 +135,6 @@
|
||||
|
||||
// TODO: improve this to be correct for more hardware
|
||||
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
|
||||
// probably other such cases, and not sure what happens on AMD hardware
|
||||
#if !defined(GGML_CUDA_FORCE_MMQ)
|
||||
#define CUDA_USE_TENSOR_CORES
|
||||
#endif
|
||||
@@ -6662,7 +6663,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
|
||||
// pool with virtual memory
|
||||
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
|
||||
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
|
||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
|
||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
|
||||
static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
|
||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||
@@ -8661,11 +8662,25 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type);
|
||||
#ifdef CUDA_USE_TENSOR_CORES
|
||||
const bool use_tensor_cores = true;
|
||||
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
|
||||
#endif // CUDA_USE_TENSOR_CORES
|
||||
|
||||
#else
|
||||
const bool use_tensor_cores = false;
|
||||
#endif
|
||||
|
||||
const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
|
||||
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
||||
#ifdef CUDA_USE_TENSOR_CORES
|
||||
// when tensor cores are available, use them for large batch size
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
||||
use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
|
||||
#endif // CUDA_USE_TENSOR_CORES
|
||||
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
// debug helpers
|
||||
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
||||
@@ -8675,13 +8690,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||
|
||||
if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
// KQ single-batch
|
||||
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
||||
} else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
} else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// KQV single-batch
|
||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||
} else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||
// KQ + KQV multi-batch
|
||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
||||
} else if (src0->type == GGML_TYPE_F32) {
|
||||
@@ -8701,14 +8716,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
||||
}
|
||||
} else {
|
||||
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
||||
|
||||
// when tensor cores are available, use them for large batch size
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
||||
if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
|
||||
use_mul_mat_q = false;
|
||||
}
|
||||
|
||||
if (use_mul_mat_q) {
|
||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
||||
} else {
|
||||
|
||||
118
ggml-quants.c
118
ggml-quants.c
@@ -410,13 +410,17 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -2481,8 +2485,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
// dot product into int32x4_t
|
||||
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
||||
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
||||
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
||||
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
@@ -2769,8 +2773,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
// dot product into int32x4_t
|
||||
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
||||
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
||||
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
||||
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
@@ -2936,11 +2940,11 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
@@ -3228,11 +3232,11 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
||||
@@ -3483,12 +3487,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
||||
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
||||
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
||||
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
@@ -3598,8 +3602,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
// We use this macro instead of a function call because for some reason
|
||||
// the code runs 2-3% slower, even if the function is declared inline
|
||||
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
||||
|
||||
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
|
||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
|
||||
@@ -3973,10 +3977,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
|
||||
q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
|
||||
|
||||
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
|
||||
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
|
||||
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
|
||||
isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
|
||||
isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
|
||||
sum += d * (isum1 + isum2);
|
||||
}
|
||||
@@ -4256,10 +4260,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
||||
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
@@ -4273,10 +4277,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
||||
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
@@ -4757,10 +4761,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
|
||||
q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
|
||||
sum += d * isum;
|
||||
|
||||
@@ -5109,14 +5113,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
||||
|
||||
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
sumi1 += vaddvq_s32(p1) * scales[2*j+0];
|
||||
|
||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
||||
|
||||
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
|
||||
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
|
||||
}
|
||||
@@ -5449,13 +5453,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
||||
|
||||
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32_t sumi1 = vaddvq_s32(p1) * scales[0];
|
||||
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
||||
|
||||
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
|
||||
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
|
||||
const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
|
||||
|
||||
sumf += d * (sumi1 + sumi2);
|
||||
@@ -5722,8 +5726,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
|
||||
q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
|
||||
|
||||
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
|
||||
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
|
||||
sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
|
||||
sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
|
||||
}
|
||||
|
||||
sumf += d * sumi - dmin * sumi_mins;
|
||||
@@ -6112,10 +6116,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
|
||||
q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
|
||||
|
||||
int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
|
||||
int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
|
||||
int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
|
||||
int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
|
||||
int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
|
||||
int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
|
||||
int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
|
||||
int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
|
||||
|
||||
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
|
||||
}
|
||||
@@ -6399,10 +6403,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
|
||||
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
@@ -6426,10 +6430,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
|
||||
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
scale += 4;
|
||||
}
|
||||
//sum += isum * d_all * y[i].d;
|
||||
@@ -6816,10 +6820,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
|
||||
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
|
||||
sum += isum * d_all * y[i].d;
|
||||
|
||||
|
||||
8
ggml.c
8
ggml.c
@@ -19638,6 +19638,14 @@ int ggml_cpu_has_avx(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_avx_vnni(void) {
|
||||
#if defined(__AVXVNNI__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_avx2(void) {
|
||||
#if defined(__AVX2__)
|
||||
return 1;
|
||||
|
||||
1
ggml.h
1
ggml.h
@@ -2198,6 +2198,7 @@ extern "C" {
|
||||
//
|
||||
|
||||
GGML_API int ggml_cpu_has_avx (void);
|
||||
GGML_API int ggml_cpu_has_avx_vnni (void);
|
||||
GGML_API int ggml_cpu_has_avx2 (void);
|
||||
GGML_API int ggml_cpu_has_avx512 (void);
|
||||
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
||||
|
||||
@@ -10780,6 +10780,7 @@ const char * llama_print_system_info(void) {
|
||||
|
||||
s = "";
|
||||
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
||||
s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
|
||||
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
||||
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
||||
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
||||
|
||||
Reference in New Issue
Block a user