Compare commits

...

8 Commits
b2042 ... b2050

Author SHA1 Message Date
kalomaze
191221178f perplexity : fix KL divergence calculations on Windows (#5273) 2024-02-02 16:15:30 +02:00
Georgi Gerganov
e437b37fd0 scripts : parse wtype in server-llm.sh (#5167)
* scripts : parse wtype in server-llm.sh

* scripts : fix check for wfile
2024-02-02 14:23:40 +02:00
Mirror Azure
2d40085c26 py : add check for '.attn.masked_bias' layers to GPT2model (#5281) 2024-02-02 13:39:09 +02:00
AidanBeltonS
b05102fe8c Tidy ggml-sycl (#5261)
* Tidy some code in ggml-sycl

* Remove blank space

* Remove std::printf comments

---------

Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
2024-02-02 16:39:48 +08:00
Xuan Son Nguyen
6b91b1e0a9 docker : add build for SYCL, Vulkan + update readme (#5228)
* add vulkan dockerfile

* intel dockerfile: compile sycl by default

* fix vulkan dockerfile

* add docs for vulkan

* docs: sycl build in docker

* docs: remove trailing spaces

* docs: sycl: add docker section

* docs: clarify install vulkan SDK outside docker

* sycl: use intel/oneapi-basekit docker image

* docs: correct TOC

* docs: correct docker image for Intel oneMKL
2024-02-02 09:56:31 +02:00
Meng, Hengyu
e805f0fa99 [SYCL] get MAX_MEM_ALLOC from device property (#5270)
* get max alloc size from device prop

* fix macro typo
2024-02-02 15:54:14 +08:00
Neo Zhang Jianyu
af3ba5d946 [SYCL] update guide of SYCL backend (#5254)
* update guide for make installation, memory, gguf model link,  rm todo for windows build

* add vs install requirement

* update for gpu device check

* update help of llama-bench

* fix grammer issues
2024-02-02 15:53:27 +08:00
Ian Bull
e1e721094d llama : fix memory leak in llama_batch_free (#5252)
The llama_batch_init allocates memory for a fixed number of tokens.
However, the llama_batch_free only frees memory for the number of
tokens that were added to the batch.

This change-set uses a null terminated array for the batch seq_id, and
frees all the elements until the nullptr is reached. This change-set
also changes the name of the first parameter from `n_tokens` to
`n_tokens_alloc` to more clearly indicate that this value is the number
of tokens allocated to the batch, not the number of tokens in the batch.
2024-02-02 09:20:13 +02:00
13 changed files with 350 additions and 130 deletions

View File

@@ -1,8 +1,8 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04
FROM intel/hpckit:$ONEAPI_VERSION as build
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
@@ -10,16 +10,18 @@ WORKDIR /app
COPY . .
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
RUN mkdir build && \
cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
cmake --build . --config Release --target main server
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build . --config Release --target main
FROM ubuntu:$UBUNTU_VERSION as runtime
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/main /main
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8

View File

@@ -0,0 +1,29 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk
# Build it
WORKDIR /app
COPY . .
RUN mkdir build && \
cd build && \
cmake .. -DLLAMA_VULKAN=1 && \
cmake --build . --config Release --target main
# Clean up
WORKDIR /
RUN cp /app/build/bin/main /main && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View File

@@ -1,8 +1,8 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04
FROM intel/hpckit:$ONEAPI_VERSION as build
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
@@ -10,13 +10,16 @@ WORKDIR /app
COPY . .
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
RUN mkdir build && \
cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
cmake --build . --config Release --target main server
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build . --config Release --target server
FROM ubuntu:$UBUNTU_VERSION as runtime
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/server /server

View File

@@ -0,0 +1,29 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk
# Build it
WORKDIR /app
COPY . .
RUN mkdir build && \
cd build && \
cmake .. -DLLAMA_VULKAN=1 && \
cmake --build . --config Release --target server
# Clean up
WORKDIR /
RUN cp /app/build/bin/server /server && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

View File

@@ -1,22 +1,15 @@
# llama.cpp for SYCL
[Background](#background)
[OS](#os)
[Intel GPU](#intel-gpu)
[Linux](#linux)
[Windows](#windows)
[Environment Variable](#environment-variable)
[Known Issue](#known-issue)
[Q&A](#q&a)
[Todo](#todo)
- [Background](#background)
- [OS](#os)
- [Intel GPU](#intel-gpu)
- [Docker](#docker)
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
- [Known Issue](#known-issue)
- [Q&A](#q&a)
- [Todo](#todo)
## Background
@@ -36,20 +29,65 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04|
|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|Windows|Support|Windows 11|
## Intel GPU
### Verified
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770, 730M|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
### Memory
The memory is a limitation to run LLM on GPUs.
When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`.
For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
## Docker
Note:
- Only docker on Linux is tested. Docker on WSL may not work.
- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
### Build the image
You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
```sh
# For F16:
#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
# Or, for F32:
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
```
### Run
```sh
# Firstly, find all the DRI cards:
ls -la /dev/dri
# Then, pick the card that you want to use.
# For example with "/dev/dri/card1"
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
## Linux
@@ -63,7 +101,7 @@ Note: for iGPU, please install the client GPU driver.
b. Add user to group: video, render.
```
```sh
sudo usermod -aG render username
sudo usermod -aG video username
```
@@ -72,7 +110,7 @@ Note: re-login to enable it.
c. Check
```
```sh
sudo apt install clinfo
sudo clinfo -l
```
@@ -90,7 +128,6 @@ Platform #0: Intel(R) OpenCL HD Graphics
2. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Recommend to install to default folder: **/opt/intel/oneapi**.
@@ -99,13 +136,13 @@ Following guide use the default folder as example. If you use other folder, plea
b. Check
```
```sh
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
@@ -118,21 +155,25 @@ Output (example):
2. Build locally:
```
Note:
- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
```sh
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
# For FP16:
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
#for FP32
# Or, for FP32:
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
# Build example/main only
#cmake --build . --config Release --target main
#build all binary
# Or, build all binary
cmake --build . --config Release -v
cd ..
@@ -140,18 +181,16 @@ cd ..
or
```
```sh
./examples/sycl/build.sh
```
Note:
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
### Run
1. Put model file to folder **models**
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
2. Enable oneAPI running environment
```
@@ -162,10 +201,10 @@ source /opt/intel/oneapi/setvars.sh
Run without parameter:
```
```sh
./build/bin/ls-sycl-device
or
# or running the "main" executable and look at the output log:
./build/bin/main
```
@@ -194,13 +233,13 @@ found 4 SYCL devices:
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
```
```sh
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run by script:
```
./examples/sycl/run-llama2.sh
```sh
./examples/sycl/run_llama2.sh
```
Note:
@@ -223,7 +262,13 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
2. Install Intel® oneAPI Base toolkit.
Note: **The driver is mandatory for compute function**.
2. Install Visual Studio.
Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows.
3. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
@@ -252,7 +297,7 @@ In oneAPI command line:
sycl-ls
```
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
@@ -260,15 +305,21 @@ Output (example):
[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
```
3. Install cmake & make
4. Install cmake & make
a. Download & install cmake for windows: https://cmake.org/download/
a. Download & install cmake for Windows: https://cmake.org/download/
b. Download & install make for windows provided by mingw-w64: https://www.mingw-w64.org/downloads/
b. Download & install make for Windows provided by mingw-w64
- Download binary package for Windows in https://github.com/niXman/mingw-builds-binaries/releases.
Like [x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z](https://github.com/niXman/mingw-builds-binaries/releases/download/13.2.0-rt_v11-rev1/x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z).
- Unzip the binary package. In the **bin** sub-folder and rename **xxx-make.exe** to **make.exe**.
- Add the **bin** folder path in the Windows system PATH environment.
### Build locally:
@@ -309,6 +360,8 @@ Note:
1. Put model file to folder **models**
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
2. Enable oneAPI running environment
- In Search, input 'oneAPI'.
@@ -419,8 +472,25 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
Miss to enable oneAPI running environment.
- Meet compile error.
Remove folder **build** and try again.
- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux.
Please run **sudo sycl-ls**.
If you see it in result, please add video/render group to your ID:
```
sudo usermod -aG render username
sudo usermod -aG video username
```
Then **relogin**.
If you do not see it, please check the installation GPU steps again.
## Todo
- Support to build in Windows.
- Support multiple cards.

View File

@@ -393,28 +393,28 @@ Building the program with BLAS support may lead to some performance improvements
Check [BLIS.md](docs/BLIS.md) for more information.
- #### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
- #### Intel oneMKL
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
- Using manual oneAPI installation:
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
```bash
mkdir build
cd build
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-runtime docker image, only required for manual installation
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build . --config Release
```
- Using oneAPI docker image:
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)
```bash
mkdir build
cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build . --config Release
```
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
@@ -601,14 +601,48 @@ Building the program with BLAS support may lead to some performance improvements
You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### SYCL
- #### Vulkan
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
**With docker**:
llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
You don't need to install Vulkan SDK. It will be installed inside the container.
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
```sh
# Build the image
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
**Without docker**:
Firstly, you need to make sure you installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
For example, on Ubuntu 22.04 (jammy), use the command below:
```bash
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt update -y
apt-get install -y vulkan-sdk
# To verify the installation, use the command below:
vulkaninfo
```
Then, build llama.cpp using the cmake command below:
```bash
mkdir -p build
cd build
cmake .. -DLLAMA_VULKAN=1
cmake --build . --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### Prepare Data & Run

View File

@@ -1138,7 +1138,7 @@ class GPT2Model(Model):
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
continue
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):

View File

@@ -23,19 +23,23 @@ usage: ./llama-bench [options]
options:
-h, --help
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-b, --batch-size <n> (default: 512)
--memory-f32 <0|1> (default: 0)
-t, --threads <n> (default: 16)
-ngl N, --n-gpu-layers <n> (default: 99)
-mg i, --main-gpu <i> (default: 0)
-mmq, --mul-mat-q <0|1> (default: 1)
-ts, --tensor_split <ts0/ts1/..>
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)
-v, --verbose (default: 0)
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-b, --batch-size <n> (default: 512)
-ctk <t>, --cache-type-k <t> (default: f16)
-ctv <t>, --cache-type-v <t> (default: f16)
-t, --threads <n> (default: 112)
-ngl, --n-gpu-layers <n> (default: 99)
-sm, --split-mode <none|layer|row> (default: layer)
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
-mmq, --mul-mat-q <0|1> (default: 1)
-ts, --tensor_split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)
-v, --verbose (default: 0)
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
```
@@ -51,6 +55,10 @@ Each test is repeated the number of times given by `-r`, and the results are ave
For a description of the other options, see the [main example](../main/README.md).
Note:
- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`.
## Examples
### Text generation with different models

View File

@@ -457,14 +457,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str());
logits_stream.open(params.logits_file.c_str(), std::ios::binary);
if (!logits_stream.is_open()) {
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {};
}
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write((const char *)&n_ctx, sizeof(n_ctx));
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
}
auto tim1 = std::chrono::high_resolution_clock::now();
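The hunk above is the whole of the "fix KL divergence calculations on Windows" change: the logits stream is now opened with `std::ios::binary` and `n_ctx` is written through a `reinterpret_cast`. The binary flag matters because a Windows `ofstream` in text mode expands every 0x0A byte it writes into 0x0D 0x0A, which silently corrupts raw float/integer payloads such as saved logits. Below is a minimal, self-contained sketch of the same write pattern — an illustration only, not the repository code; the file name and values are placeholders.

```cpp
// Sketch: writing a raw binary header + float payload the way perplexity.cpp does.
// Without std::ios::binary, any byte equal to 0x0A would be rewritten as 0x0D 0x0A
// on Windows and the file would no longer round-trip.
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <vector>

int main() {
    const std::vector<float> logits = {0.10f, -1.25f, 3.5e-3f}; // placeholder data
    const int32_t n_ctx = 512;                                  // placeholder header value

    std::ofstream out("logits.bin", std::ios::binary);          // binary mode: no newline translation
    out.write("_logits_", 8);                                                   // file magic
    out.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));           // header
    out.write(reinterpret_cast<const char *>(logits.data()),
              static_cast<std::streamsize>(logits.size() * sizeof(float)));     // payload
    out.close();

    std::printf("wrote %zu floats\n", logits.size());
    return 0;
}
```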

View File

@@ -2,7 +2,7 @@
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

View File

@@ -337,6 +337,7 @@ namespace dpct
}
size_t get_global_mem_size() const { return _global_mem_size; }
size_t get_local_mem_size() const { return _local_mem_size; }
size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; }
/// Returns the maximum clock rate of device's global memory in kHz. If
/// compiler does not support this API then returns default value 3200000 kHz.
unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
@@ -398,6 +399,10 @@ namespace dpct
{
_local_mem_size = local_mem_size;
}
void set_max_mem_alloc_size(size_t max_mem_alloc_size)
{
_max_mem_alloc_size = max_mem_alloc_size;
}
void set_max_work_group_size(int max_work_group_size)
{
_max_work_group_size = max_work_group_size;
@@ -465,6 +470,7 @@ namespace dpct
int _max_register_size_per_work_group;
size_t _global_mem_size;
size_t _local_mem_size;
size_t _max_mem_alloc_size;
size_t _max_nd_range_size[3];
int _max_nd_range_size_i[3];
uint32_t _device_id;
@@ -516,6 +522,7 @@ namespace dpct
dev.get_info<sycl::info::device::max_work_group_size>());
prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
prop.set_max_mem_alloc_size(dev.get_info<sycl::info::device::max_mem_alloc_size>());
#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
@@ -644,6 +651,11 @@ namespace dpct
return get_device_info().get_global_mem_size();
}
size_t get_max_mem_alloc_size() const
{
return get_device_info().get_max_mem_alloc_size();
}
/// Get the number of bytes of free and total memory on the SYCL device.
/// \param [out] free_memory The number of bytes of free memory on the SYCL device.
/// \param [out] total_memory The number of bytes of total memory on the SYCL device.
@@ -1354,6 +1366,7 @@ namespace dpct
}
#else
return q.memcpy(to_ptr, from_ptr, size, dep_events);
GGML_UNUSED(direction);
#endif // DPCT_USM_LEVEL_NONE
}
@@ -1655,7 +1668,7 @@ namespace dpct
using Ty = typename DataType<T>::T2;
Ty s_h;
if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host)
.wait();
else
s_h = *reinterpret_cast<const Ty *>(s);
@@ -1679,6 +1692,20 @@ namespace dpct
int ldb, const void *beta, void *c, int ldc)
{
#ifndef __INTEL_MKL__
GGML_UNUSED(q);
GGML_UNUSED(a_trans);
GGML_UNUSED(b_trans);
GGML_UNUSED(m);
GGML_UNUSED(n);
GGML_UNUSED(k);
GGML_UNUSED(alpha);
GGML_UNUSED(a);
GGML_UNUSED(lda);
GGML_UNUSED(b);
GGML_UNUSED(ldb);
GGML_UNUSED(beta);
GGML_UNUSED(c);
GGML_UNUSED(ldc);
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
"Project does not support this API.");
#else
@@ -1818,7 +1845,7 @@ namespace dpct
template <typename T>
T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
int logical_sub_group_size = 32)
unsigned int logical_sub_group_size = 32)
{
unsigned int id = g.get_local_linear_id();
unsigned int start_index =
@@ -2148,6 +2175,7 @@ namespace dpct
}
#else
return q.memcpy(to_ptr, from_ptr, size, dep_events);
GGML_UNUSED(direction);
#endif // DPCT_USM_LEVEL_NONE
}
@@ -3290,7 +3318,7 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo
std::ofstream logfile;
logfile.open(filename);
// printf("local buf element %d\n", total_elements);
for(int i=0; i<total_elements; i++){
for(size_t i=0; i<total_elements; i++){
if((i+1)%20 ==0) logfile <<std::endl;
else logfile << local_buf[i] <<" ";
}
@@ -3384,6 +3412,7 @@ static __dpct_inline__ float warp_reduce_max(float x,
static __dpct_inline__ float op_repeat(const float a, const float b) {
return b;
GGML_UNUSED(a);
}
static __dpct_inline__ float op_add(const float a, const float b) {
@@ -11144,10 +11173,10 @@ DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.
// g_sycl_pool_handles[GGML_SYCL_MAX_DEVICES];
static dpct::device_ptr g_sycl_pool_addr[GGML_SYCL_MAX_DEVICES] = {0};
static size_t g_sycl_pool_used[GGML_SYCL_MAX_DEVICES] = {0};
static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
static void *ggml_sycl_pool_malloc_vmm(size_t size, size_t *actual_size) try {
GGML_UNUSED(size);
GGML_UNUSED(actual_size);
return NULL;
}
catch (sycl::exception const &exc) {
@@ -11311,10 +11340,10 @@ void ggml_init_sycl() try {
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
int64_t total_vram = 0;
#if defined(GGML_SYCL_FP16)
fprintf(stderr, "%s: GGML_SYCL_FP16: yes\n", __func__);
#if defined(GGML_SYCL_F16)
fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
#else
fprintf(stderr, "%s: GGML_SYCL_FP16: no\n", __func__);
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
#endif
@@ -11337,9 +11366,8 @@ void ggml_init_sycl() try {
if(id!=user_device_id) continue;
device_inx++;
int device_vmm = 0;
g_device_caps[device_inx].vmm = !!device_vmm;
g_device_caps[device_inx].vmm = 0;
g_device_caps[device_inx].device_id = id;
g_sycl_device_id2index[id].index = device_inx;
@@ -11347,18 +11375,12 @@ void ggml_init_sycl() try {
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(id))));
// fprintf(stderr,
// " Device %d: %s, compute capability %d.%d, VMM: %s\n", id,
// prop.get_name(), prop.get_major_version(),
// prop.get_minor_version(), device_vmm ? "yes" : "no");
g_tensor_split[device_inx] = total_vram;
total_vram += prop.get_global_mem_size();
g_device_caps[device_inx].cc =
100 * prop.get_major_version() + 10 * prop.get_minor_version();
// printf("g_device_caps[%d].cc=%d\n", device_inx, g_device_caps[device_inx].cc);
}
device_inx = -1;
for (int id = 0; id < g_all_sycl_device_count; ++id) {
@@ -12194,7 +12216,6 @@ inline void ggml_sycl_op_mul_mat_sycl(
// ldc == nrows of the matrix that cuBLAS writes into
int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
const int compute_capability = g_device_caps[id].cc;
#ifdef GGML_SYCL_F16
bool use_fp16 = true; // TODO(Yu) SYCL capability check
#else
@@ -12679,7 +12700,7 @@ static void ggml_sycl_set_peer_access(const int n_tokens) {
continue;
}
int can_access_peer;
// int can_access_peer;
// SYCL_CHECK(syclDeviceCanAccessPeer(&can_access_peer, id, id_other));
// if (can_access_peer) {
// if (enable_peer_access) {
@@ -12704,7 +12725,6 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t nrows0 = ggml_nrows(src0);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
@@ -13800,13 +13820,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
src1_row_extra.data_device[g_main_device_index] = src1_contiguous.get();
dst_row_extra.data_device[g_main_device_index] = dst_contiguous.get();
const dpct::memcpy_direction src1_kind =
src1->backend == GGML_BACKEND_CPU ? dpct::host_to_device
: dpct::device_to_device;
const dpct::memcpy_direction dst_kind = dst->backend == GGML_BACKEND_CPU
? dpct::device_to_host
: dpct::device_to_device;
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
@@ -14788,6 +14801,12 @@ static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_ty
UNUSED(buft);
}
static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
return dpct::get_current_device().get_max_mem_alloc_size();
UNUSED(buft);
}
static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
int64_t row_low = 0;
int64_t row_high = ggml_nrows(tensor);
@@ -14818,7 +14837,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
/* .get_name = */ ggml_backend_sycl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
/* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
/* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
/* .is_host = */ nullptr,
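The ggml-sycl hunks above cache the device's maximum single-allocation size (queried from `sycl::info::device::max_mem_alloc_size`) in the dpct device-info struct and report it through `ggml_backend_sycl_buffer_type_get_max_size`, replacing the previous `NULL` entry in the buffer-type interface. For reference, the underlying query can be exercised on its own; the sketch below is a hedged, standalone example assuming a SYCL 2020 compiler such as icpx with at least one visible device, not code from the repository.

```cpp
// Sketch: list each SYCL device's largest single allocation vs. its total global memory.
// max_mem_alloc_size can be smaller than global_mem_size, which is why the backend
// now reports it as the buffer max size instead of leaving it unset.
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
    for (const auto & dev : sycl::device::get_devices()) {
        const size_t max_alloc = dev.get_info<sycl::info::device::max_mem_alloc_size>();
        const size_t global    = dev.get_info<sycl::info::device::global_mem_size>();
        std::printf("%-40s max alloc: %8.1f MiB, global: %8.1f MiB\n",
                    dev.get_info<sycl::info::device::name>().c_str(),
                    max_alloc / (1024.0 * 1024.0),
                    global    / (1024.0 * 1024.0));
    }
    return 0;
}
```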

View File

@@ -11377,22 +11377,24 @@ struct llama_batch llama_batch_get_one(
};
}
struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
} else {
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
for (int i = 0; i < n_tokens; ++i) {
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
for (int i = 0; i < n_tokens_alloc; ++i) {
batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
}
batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
batch.seq_id[n_tokens_alloc] = nullptr;
batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);
return batch;
}
@@ -11403,7 +11405,7 @@ void llama_batch_free(struct llama_batch batch) {
if (batch.pos) free(batch.pos);
if (batch.n_seq_id) free(batch.n_seq_id);
if (batch.seq_id) {
for (int i = 0; i < batch.n_tokens; ++i) {
for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
free(batch.seq_id[i]);
}
free(batch.seq_id);
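As described in the llama_batch_free commit message above, the fix over-allocates `seq_id` by one pointer slot and null-terminates it, so the free path walks to the sentinel instead of trusting `batch.n_tokens`, which only counts the tokens that were actually added after init. The sketch below shows the same sentinel pattern in isolation — a simplified illustration with hypothetical helper names (`alloc_seq_ids`, `free_seq_ids`), not the llama.cpp implementation.

```cpp
// Sketch: allocate n+1 pointer slots, null-terminate, and free until the sentinel,
// so the free routine does not depend on how many entries were later used.
#include <cstdint>
#include <cstdlib>

int32_t ** alloc_seq_ids(int32_t n_alloc, int32_t n_seq_max) {
    int32_t ** seq_id = (int32_t **) malloc(sizeof(int32_t *) * (n_alloc + 1));
    for (int32_t i = 0; i < n_alloc; ++i) {
        seq_id[i] = (int32_t *) malloc(sizeof(int32_t) * n_seq_max);
    }
    seq_id[n_alloc] = nullptr; // sentinel marks the end of the allocated block
    return seq_id;
}

void free_seq_ids(int32_t ** seq_id) {
    if (!seq_id) return;
    for (int32_t i = 0; seq_id[i] != nullptr; ++i) { // walk to the sentinel
        free(seq_id[i]);
    }
    free(seq_id);
}

int main() {
    int32_t ** ids = alloc_seq_ids(/*n_alloc=*/8, /*n_seq_max=*/1);
    free_seq_ids(ids); // frees all 8 inner arrays even if fewer entries were filled
    return 0;
}
```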

View File

@@ -141,6 +141,28 @@ for wt in "${wtypes[@]}"; do
wfiles+=("")
done
# map wtype input to index
if [[ ! -z "$wtype" ]]; then
iw=-1
is=0
for wt in "${wtypes[@]}"; do
# uppercase
uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]')
if [[ "$uwt" == "$wtype" ]]; then
iw=$is
break
fi
is=$((is+1))
done
if [[ $iw -eq -1 ]]; then
printf "[-] Invalid weight type: %s\n" "$wtype"
exit 1
fi
wtype="$iw"
fi
# sample repos
repos=(
"https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
@@ -252,8 +274,10 @@ for file in $model_files; do
printf " %2d) %s %s\n" $iw "$have" "$file"
done
wfile="${wfiles[$wtype]}"
# ask for weights type until provided and available
while [[ -z "$wtype" ]]; do
while [[ -z "$wfile" ]]; do
printf "\n"
read -p "[+] Select weight type: " wtype
wfile="${wfiles[$wtype]}"