Compare commits

...

6 Commits
b4506 ... b4512

Author SHA1 Message Date
Kyle Bruene
ae3c1db2f9 llama : re-add LLM_ARCH_PHIMOE (#11305)
Phi 3.5 MoE was partially removed during a refactor. The code was originally in llama.cpp and should be in llama-model.cpp after the refactor.
2025-01-20 09:21:01 +02:00
Georgi Gerganov
92bc493917 tests : increase timeout when sanitizers are enabled (#11300)
* tests : increase timeout when sanitizers are enabled

* tests : add DEFAULT_HTTP_TIMEOUT
2025-01-19 20:22:30 +02:00
Georgi Gerganov
b9daaffe02 simple-chat : fix BOS being added to each message (#11278) 2025-01-19 18:12:09 +02:00
Nicolò Scipione
99487b57d4 SYCL: Introducing memory host pool (#11251)
* Implement host pool for matrix_info

Creating a new memory pool on the host to store memory location for
matrix_info needed to launch gemm_batch from oneMKL/oneMath.
Removing complex support in gemm_batch since it is not used in llama.cpp

* Remove unnecessary headers and cast

* Reorder member variable to avoid warning on initialization

* Formatting

* Remove unused variable

* Address PR review feedback - remove warning

---------

Signed-off-by: nscipione <nicolo.scipione@codeplay.com>
2025-01-19 21:33:34 +08:00
Eric Curtin
a1649cc13f Adding linenoise.cpp to llama-run (#11252)
This is a fork of linenoise that is C++17 compatible. I intend on
adding it to llama-run so we can do things like traverse prompt
history via the up and down arrows:

https://github.com/ericcurtin/linenoise.cpp

Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-01-18 14:42:31 +00:00
Georgi Gerganov
4dd34ff831 cmake : add sanitizer flags for llama.cpp (#11279)
* cmake : add sanitizer flags for llama.cpp

ggml-ci

* tests : fix compile warnings

ggml-ci

* cmake : move sanitizer flags to llama_add_compile_flags

ggml-ci

* cmake : move llama.cpp compile flags to top level lists

ggml-ci

* cmake : apply only sanitizer flags at top level

ggml-ci

* tests : fix gguf context use in same_tensor_data

* gguf-test: tensor data comparison

* dummy : trigger ggml-ci

* unicode : silence gcc warnings

ggml-ci

* ci : use sanitizer builds only in Debug mode

ggml-ci

* cmake : add status messages [no ci]

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-01-18 16:18:15 +02:00
19 changed files with 1808 additions and 162 deletions

View File

@@ -87,6 +87,7 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts
@@ -149,6 +150,7 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts
@@ -217,6 +219,7 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
- name: Upload artifacts
@@ -234,7 +237,7 @@ jobs:
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
build_type: [Debug]
steps:
- name: Clone
@@ -796,6 +799,7 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts

View File

@@ -112,9 +112,9 @@ jobs:
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
@@ -124,12 +124,31 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
run: |
cd examples/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd examples/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}

View File

@@ -83,11 +83,8 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
# override ggml options
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
# change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE)
@@ -117,16 +114,62 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
message(STATUS "Using -fsanitize=thread")
add_compile_options(-fsanitize=thread)
link_libraries (-fsanitize=thread)
endif()
if (LLAMA_SANITIZE_ADDRESS)
message(STATUS "Using -fsanitize=address")
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
link_libraries (-fsanitize=address)
endif()
if (LLAMA_SANITIZE_UNDEFINED)
message(STATUS "Using -fsanitize=undefined")
add_compile_options(-fsanitize=undefined)
link_libraries (-fsanitize=undefined)
endif()
endif()
#
# build the library
# 3rd-party
#
if (NOT TARGET ggml)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
#
# build the library
#
add_subdirectory(src)
#
# utils, programs, examples and tests
#
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()
#
# install
#
@@ -200,21 +243,3 @@ configure_file(cmake/llama.pc.in
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION lib/pkgconfig)
#
# utils, programs, examples and tests
#
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()

View File

@@ -1,5 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -0,0 +1,26 @@
Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,114 @@
/* linenoise.h -- VERSION 1.0
*
* Guerrilla line editing library against the idea that a line editing lib
* needs to be 20,000 lines of C++ code.
*
* See linenoise.cpp for more information.
*
* ------------------------------------------------------------------------
*
* Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
* Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
* Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __LINENOISE_H
#define __LINENOISE_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stddef.h> /* For size_t. */
extern const char *linenoiseEditMore;
/* The linenoiseState structure represents the state during line editing.
* We pass this state to functions implementing specific editing
* functionalities. */
struct linenoiseState {
int in_completion; /* The user pressed TAB and we are now in completion
* mode, so input is handled by completeLine(). */
size_t completion_idx; /* Index of next completion to propose. */
int ifd; /* Terminal stdin file descriptor. */
int ofd; /* Terminal stdout file descriptor. */
char *buf; /* Edited line buffer. */
size_t buflen; /* Edited line buffer size. */
const char *prompt; /* Prompt to display. */
size_t plen; /* Prompt length. */
size_t pos; /* Current cursor position. */
size_t oldpos; /* Previous refresh cursor position. */
size_t len; /* Current edited line length. */
size_t cols; /* Number of columns in terminal. */
size_t oldrows; /* Rows used by last refrehsed line (multiline mode) */
int history_index; /* The history index we are currently editing. */
};
typedef struct linenoiseCompletions {
size_t len;
char **cvec;
} linenoiseCompletions;
/* Non blocking API. */
int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
const char *linenoiseEditFeed(struct linenoiseState *l);
void linenoiseEditStop(struct linenoiseState *l);
void linenoiseHide(struct linenoiseState *l);
void linenoiseShow(struct linenoiseState *l);
/* Blocking API. */
const char *linenoise(const char *prompt);
void linenoiseFree(void *ptr);
/* Completion API. */
typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
typedef void(linenoiseFreeHintsCallback)(const char *);
void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
void linenoiseSetHintsCallback(linenoiseHintsCallback *);
void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
void linenoiseAddCompletion(linenoiseCompletions *, const char *);
/* History API. */
int linenoiseHistoryAdd(const char *line);
int linenoiseHistorySetMaxLen(int len);
int linenoiseHistorySave(const char *filename);
int linenoiseHistoryLoad(const char *filename);
/* Other utilities. */
void linenoiseClearScreen(void);
void linenoiseSetMultiLine(int ml);
void linenoisePrintKeyCodes(void);
void linenoiseMaskModeEnable(void);
void linenoiseMaskModeDisable(void);
#ifdef __cplusplus
}
#endif
#endif /* __LINENOISE_H */

View File

@@ -19,12 +19,14 @@
#include <cstring>
#include <filesystem>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include <vector>
#include "common.h"
#include "json.hpp"
#include "linenoise.cpp/linenoise.h"
#include "llama-cpp.h"
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
@@ -536,7 +538,7 @@ class LlamaData {
llama_sampler_ptr sampler;
llama_context_ptr context;
std::vector<llama_chat_message> messages;
std::vector<std::string> msg_strs;
std::list<std::string> msg_strs;
std::vector<char> fmtted;
int init(Opt & opt) {
@@ -807,24 +809,44 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
batch = llama_batch_get_one(&new_token_id, 1);
}
printf("\033[0m");
return 0;
}
static int read_user_input(std::string & user) {
std::getline(std::cin, user);
static int read_user_input(std::string & user_input) {
static const char * prompt_prefix = "> ";
#ifdef WIN32
printf(
"\r%*s"
"\r\033[0m%s",
get_terminal_width(), " ", prompt_prefix);
std::getline(std::cin, user_input);
if (std::cin.eof()) {
printf("\n");
return 1;
}
if (user == "/bye") {
#else
std::unique_ptr<char, decltype(&std::free)> line(const_cast<char *>(linenoise(prompt_prefix)), free);
if (!line) {
return 1;
}
if (user.empty()) {
user_input = line.get();
#endif
if (user_input == "/bye") {
return 1;
}
if (user_input.empty()) {
return 2;
}
#ifndef WIN32
linenoiseHistoryAdd(line.get());
#endif
return 0; // Should have data in happy path
}
@@ -865,10 +887,6 @@ static int handle_user_input(std::string & user_input, const std::string & user)
return 0; // No need for interactive input
}
printf(
"\r%*s"
"\r\033[32m> \033[0m",
get_terminal_width(), " ");
return read_user_input(user_input); // Returns true if input ends the loop
}

View File

@@ -26,6 +26,9 @@ from re import RegexFlag
import wget
DEFAULT_HTTP_TIMEOUT = 10 if "LLAMA_SANITIZE" not in os.environ else 30
class ServerResponse:
headers: dict
status_code: int
@@ -88,7 +91,7 @@ class ServerProcess:
if "PORT" in os.environ:
self.server_port = int(os.environ["PORT"])
def start(self, timeout_seconds: int = 10) -> None:
def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
if "LLAMA_SERVER_BIN_PATH" in os.environ:
server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
elif os.name == "nt":

View File

@@ -95,11 +95,11 @@ int main(int argc, char ** argv) {
llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
// helper function to evaluate a prompt and generate a response
auto generate = [&](const std::string & prompt) {
auto generate = [&](const std::string & prompt, bool is_first) {
std::string response;
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
GGML_ABORT("failed to tokenize the prompt\n");
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
// generate a response
printf("\033[33m");
std::string response = generate(prompt);
std::string response = generate(prompt, prev_len == 0);
printf("\n\033[0m");
// add the response to the messages

View File

@@ -333,8 +333,12 @@ struct ggml_backend_sycl_context {
// pool
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);
ggml_sycl_pool & pool(int device) {
if (pools[device] == nullptr) {
pools[device] = new_pool_for_device(stream(device,0), device);
@@ -345,6 +349,15 @@ struct ggml_backend_sycl_context {
ggml_sycl_pool & pool() {
return pool(device);
}
ggml_sycl_pool & host_pool(int device) {
if (host_pools[device] == nullptr) {
host_pools[device] = new_pool_for_host(stream(device, 0), device);
}
return *host_pools[device];
}
ggml_sycl_pool & host_pool() { return host_pool(device); }
};
// common device functions

View File

@@ -82,6 +82,14 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
return device_type.str();
}
template <typename Ts> struct matrix_info_t {
oneapi::mkl::transpose transpose_info[2];
Ts value_info[2];
std::int64_t size_info[3];
std::int64_t ld_info[3];
std::int64_t groupsize_info;
};
namespace dpct
{
typedef sycl::queue *queue_ptr;
@@ -1727,26 +1735,13 @@ namespace dpct
};
template <class Ta, class Tb, class Tc, class Ts>
inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
oneapi::mkl::transpose b_trans, int m, int n, int k,
const void *alpha, const void **a, int lda,
const void **b, int ldb, const void *beta, void **c,
int ldc, int batch_size)
{
struct matrix_info_t
{
oneapi::mkl::transpose transpose_info[2];
Ts value_info[2];
std::int64_t size_info[3];
std::int64_t ld_info[3];
std::int64_t groupsize_info;
};
inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
int ldb, const void * beta, void ** c, int ldc, int batch_size,
matrix_info_t<float> * matrix_info) {
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
matrix_info_t *matrix_info =
(matrix_info_t *)std::malloc(sizeof(matrix_info_t));
matrix_info->transpose_info[0] = a_trans;
matrix_info->transpose_info[1] = b_trans;
matrix_info->value_info[0] = alpha_value;
@@ -1763,23 +1758,18 @@ namespace dpct
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast<const Ta **>(a),
matrix_info->ld_info, reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
matrix_info->value_info + 1, reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1,
&(matrix_info->groupsize_info));
matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#else
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info,
matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#endif
q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(e);
cgh.host_task([=] { std::free(matrix_info); }); });
}
template <class Ta, class Tb, class Tc, class Ts>
@@ -2422,25 +2412,11 @@ namespace dpct
/// \param [in] ldc Leading dimension of C.
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
/// \param [in] scaling_type Data type of the scaling factors.
inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
oneapi::mkl::transpose b_trans, int m, int n, int k,
const void *alpha, const void *a[],
library_data_t a_type, int lda, const void *b[],
library_data_t b_type, int ldb, const void *beta,
void *c[], library_data_t c_type, int ldc,
int batch_size, library_data_t scaling_type)
{
if (scaling_type == library_data_t::real_float &&
c_type == library_data_t::complex_float)
{
scaling_type = library_data_t::complex_float;
}
else if (scaling_type == library_data_t::real_double &&
c_type == library_data_t::complex_double)
{
scaling_type = library_data_t::complex_double;
}
inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
matrix_info_t<float> * matrix_info) {
std::uint64_t key =
detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
switch (key)
@@ -2449,48 +2425,24 @@ namespace dpct
library_data_t::real_float, library_data_t::real_float,
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<float, float, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
detail::gemm_batch_impl<float, float, float, float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
beta, c, ldc, batch_size, matrix_info);
break;
}
case detail::get_type_combination_id(
library_data_t::real_double, library_data_t::real_double,
library_data_t::real_double, library_data_t::real_double):
{
detail::gemm_batch_impl<double, double, double, double>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::complex_float, library_data_t::complex_float,
library_data_t::complex_float, library_data_t::complex_float):
{
detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
std::complex<float>, std::complex<float>>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::complex_double, library_data_t::complex_double,
library_data_t::complex_double, library_data_t::complex_double):
{
detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
std::complex<double>, std::complex<double>>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
detail::gemm_batch_impl<double, double, double, double>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
beta, c, ldc, batch_size, matrix_info);
break;
}
case detail::get_type_combination_id(
library_data_t::real_half, library_data_t::real_half,
library_data_t::real_half, library_data_t::real_half):
{
detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
a, lda, b, ldb, beta, c, ldc,
batch_size);
detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break;
}
#ifdef __INTEL_MKL__
@@ -2498,19 +2450,16 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float):
{
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break;
}
case detail::get_type_combination_id(
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_size);
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break;
}
#endif
@@ -2522,10 +2471,9 @@ namespace dpct
dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
float beta_float =
dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
float>(q, a_trans, b_trans, m, n, k, &alpha_float,
a, lda, b, ldb, &beta_float, c, ldc,
batch_size);
detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, float>(
q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc, batch_size,
matrix_info);
break;
}
case detail::get_type_combination_id(
@@ -2533,8 +2481,7 @@ namespace dpct
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break;
}
case detail::get_type_combination_id(
@@ -2542,8 +2489,7 @@ namespace dpct
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break;
}
case detail::get_type_combination_id(
@@ -2557,8 +2503,7 @@ namespace dpct
sycl::half alpha_half(alpha_value);
sycl::half beta_half(beta_value);
detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
batch_size);
q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, batch_size, matrix_info);
break;
}
default:

View File

@@ -1173,6 +1173,85 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
}
};
struct ggml_sycl_pool_host : public ggml_sycl_pool {
queue_ptr qptr;
int device;
inline static int counter{ 0 };
struct ggml_sycl_buffer {
void * ptr = nullptr;
size_t size = 0;
};
// Set arbitrarly to 64
static constexpr int MAX_POOL_SIZE{ 64 };
std::vector<ggml_sycl_buffer> buffer_pool = std::vector<ggml_sycl_buffer>(MAX_POOL_SIZE);
size_t pool_size = 0;
explicit ggml_sycl_pool_host(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}
~ggml_sycl_pool_host() {
for (int i = 0; i < MAX_POOL_SIZE; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
b.ptr = nullptr;
pool_size -= b.size;
b.size = 0;
}
}
counter = 0;
}
void * alloc(size_t size, size_t * actual_size) override {
if (counter == MAX_POOL_SIZE) {
ggml_sycl_buffer b = buffer_pool[0];
void * ptr = b.ptr;
*actual_size = b.size;
counter = 1;
return ptr;
}
ggml_sycl_buffer & b = buffer_pool[counter];
if (b.ptr == nullptr) {
void * ptr;
SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *) sycl::malloc_host(size, *qptr)));
if (!ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
return nullptr;
}
pool_size += size;
*actual_size = size;
counter = counter + 1;
return ptr;
} else {
++counter;
b.size = size;
return b.ptr;
}
}
void free(void * ptr, size_t size) override {
// if the pool is not completed add the pointer to it in place of the first nullptr found.
// Otherwise do nothing, pointers will be freed once the pool is deallocated.
for (int i = 0; i < MAX_POOL_SIZE; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr == nullptr) {
b.ptr = ptr;
b.size = size;
return;
}
}
}
};
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(queue_ptr qptr, int device) {
// return pool for the host to speed up memory management
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_host(qptr, device));
}
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
// TBD: NO VMM support
// if (ggml_sycl_info().devices[device].vmm) {
@@ -3363,6 +3442,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
ggml_sycl_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
sycl::range<3> block_dims(1, ne12, ne13);
/*
@@ -3391,14 +3471,10 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
});
}
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*main_stream, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **)(ptrs_src.get() + 0 * ne23),
dpct::library_data_t::real_half, nb01 / nb00,
(const void **)(ptrs_src.get() + 1 * ne23),
dpct::library_data_t::real_half, nb11 / nb10, beta,
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
cu_compute_type)));
*main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
(void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
}
}
catch (sycl::exception const &exc) {

View File

@@ -648,6 +648,10 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
ok = ok && data != nullptr;
if (ok) {
ggml_set_name(data, "GGUF tensor data binary blob");
}
// read the binary blob with the tensor data
ok = ok && gr.read(data->data, ctx->size);

View File

@@ -2203,6 +2203,50 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
}
} break;
case LLM_ARCH_PHIMOE:
{
const int64_t n_embd_head = n_embd / n_head;
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
if (layer.wqkv == nullptr) {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
}
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
}
} break;
case LLM_ARCH_PLAMO:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

View File

@@ -7,18 +7,17 @@
#include <algorithm>
#include <cassert>
#include <codecvt>
#include <cstddef>
#include <cstdint>
#include <locale>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <locale>
#include <codecvt>
size_t unicode_len_utf8(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };

View File

@@ -1,3 +1,5 @@
llama_add_compile_flags()
function(llama_test target)
include(CMakeParseArguments)
set(options)

View File

@@ -48,7 +48,7 @@ enum handcrafted_file_type {
HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data,
};
std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
static std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
switch (hft) {
case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC";
case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1";
@@ -99,7 +99,7 @@ static bool expect_context_not_null(const enum handcrafted_file_type hft) {
typedef std::pair<enum ggml_type, std::array<int64_t, GGML_MAX_DIMS>> tensor_config_t;
std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
static std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
std::vector<tensor_config_t> tensor_configs;
tensor_configs.reserve(100);
@@ -122,7 +122,7 @@ std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
return tensor_configs;
}
std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 rng) {
static std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 rng) {
std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
kv_types.reserve(100);
@@ -626,8 +626,6 @@ static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const u
bool ok = true;
const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
for (int i = 0; i < int(tensor_configs.size()); ++i) {
const ggml_type type = tensor_configs[i].first;
const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
@@ -866,13 +864,13 @@ static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t
case GGUF_TYPE_COUNT:
default: {
GGML_ABORT("fatal error");
} break;
}
}
} break;
case GGUF_TYPE_COUNT:
default: {
GGML_ABORT("fatal error");
} break;
}
}
}
@@ -938,7 +936,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
}
if (type == GGUF_TYPE_ARRAY) {
const int arr_n = gguf_get_arr_n(ctx, id);
const size_t arr_n = gguf_get_arr_n(ctx, id);
if (arr_n != gguf_get_arr_n(other, idx_other)) {
ok = false;
continue;
@@ -953,7 +951,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
if (type_arr == GGUF_TYPE_BOOL) {
const int8_t * data = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx, id));
const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other));
for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
if (bool(data[arr_i]) != bool(data_other[arr_i])) {
ok = false;
}
@@ -962,7 +960,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
}
if (type_arr == GGUF_TYPE_STRING) {
for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
const std::string str = gguf_get_arr_str(ctx, id, arr_i);
const std::string str_other = gguf_get_arr_str(other, idx_other, arr_i);
if (str != str_other) {
@@ -1033,6 +1031,12 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
struct ggml_tensor * t_orig = ggml_get_first_tensor(orig);
struct ggml_tensor * t_read = ggml_get_first_tensor(read);
if (std::string(t_read->name) != "GGUF tensor data binary blob") {
return false;
}
t_read = ggml_get_next_tensor(read, t_read);
while (t_orig) {
if (!t_read) {
ok = false;
@@ -1051,13 +1055,13 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
}
t_orig = ggml_get_next_tensor(orig, t_orig);
t_read = ggml_get_next_tensor(orig, t_read);
t_read = ggml_get_next_tensor(read, t_read);
}
if (t_read) {
ok = false;
}
return true;
return ok;
}
static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {

View File

@@ -144,7 +144,6 @@ static void test_penalties(
sampler_tester tester(probs, probs_expected);
const size_t n_vocab = probs.size();
auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
for (size_t i = 0; i < last_tokens.size(); i++) {