mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-05 16:44:11 +00:00
Compare commits
10 Commits
master-ee0
...
master-5b7
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5b70e7de4c | ||
|
|
a717cba844 | ||
|
|
d0a7f742e7 | ||
|
|
0d054e292e | ||
|
|
3525899277 | ||
|
|
1d08882afa | ||
|
|
02c5b27e91 | ||
|
|
cbef542879 | ||
|
|
9733104be5 | ||
|
|
3df890aef4 |
@@ -68,7 +68,9 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
# Compile flags
|
||||
#
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_C_STANDARD_REQUIRED true)
|
||||
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
12
README.md
12
README.md
@@ -37,9 +37,11 @@ Supported platforms:
|
||||
|
||||
Supported models:
|
||||
|
||||
- [X] LLaMA
|
||||
- [X] LLaMA 🦙
|
||||
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
|
||||
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
|
||||
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
|
||||
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
||||
|
||||
---
|
||||
|
||||
@@ -153,8 +155,8 @@ python3 -m pip install torch numpy sentencepiece
|
||||
# convert the 7B model to ggml FP16 format
|
||||
python3 convert-pth-to-ggml.py models/7B/ 1
|
||||
|
||||
# quantize the model to 4-bits
|
||||
python3 quantize.py 7B
|
||||
# quantize the model to 4-bits (using method 2 = q4_0)
|
||||
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
|
||||
|
||||
# run the inference
|
||||
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
|
||||
@@ -299,7 +301,7 @@ And after 4.45 hours, you will have the final perplexity.
|
||||
|
||||
### Android
|
||||
|
||||
You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
|
||||
You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
|
||||
First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
|
||||
```
|
||||
$ mkdir build-android
|
||||
@@ -308,7 +310,7 @@ $ export NDK=<your_ndk_directory>
|
||||
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
||||
$ make
|
||||
```
|
||||
Install [termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
|
||||
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
|
||||
Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:
|
||||
|
||||
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
||||
|
||||
@@ -27,9 +27,9 @@ def read_tokens(fin, vocab_size):
|
||||
text_len = struct.unpack("i", fin.read(4))[0]
|
||||
text_bytes = fin.read(text_len)
|
||||
try:
|
||||
text = text_bytes.decode("utf-8")
|
||||
text = text_bytes.decode()
|
||||
except UnicodeDecodeError:
|
||||
text = text_bytes.decode("utf-8", "replace")
|
||||
text = text_bytes.decode(errors="replace")
|
||||
score = struct.unpack("f", fin.read(4))[0]
|
||||
tokens.append((text, score))
|
||||
return tokens
|
||||
@@ -82,7 +82,7 @@ def read_variables(fin):
|
||||
|
||||
shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
|
||||
shape = shape[::-1]
|
||||
name = fin.read(name_length).decode("utf-8")
|
||||
name = fin.read(name_length).decode()
|
||||
|
||||
# ensure tensor data is aligned
|
||||
tensor_data_offset = fin.tell()
|
||||
@@ -199,7 +199,7 @@ def chat(model, hparams, llama_dir):
|
||||
device = torch.device("cpu")
|
||||
llama = llama.to(device)
|
||||
|
||||
ctx = """You are AI.
|
||||
ctx = """You are AI.
|
||||
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
|
||||
User: Hello, AI.
|
||||
AI: Hello! How can I assist you today?
|
||||
@@ -207,11 +207,11 @@ AI: Hello! How can I assist you today?
|
||||
print(ctx.rstrip("\n"))
|
||||
while True:
|
||||
print("-" * 60)
|
||||
prompt = input(f"User: ")
|
||||
prompt = input("User: ")
|
||||
if ctx != "":
|
||||
ctx = ctx + "User: " + prompt + "\n"
|
||||
ctx = f"{ctx}User: {prompt}\n"
|
||||
else:
|
||||
ctx = prompt + "\nAI:"
|
||||
ctx = f"{prompt}\nAI:"
|
||||
|
||||
ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
|
||||
|
||||
@@ -236,7 +236,7 @@ AI: Hello! How can I assist you today?
|
||||
)
|
||||
s = generation_output.sequences[0]
|
||||
decoded = tokenizer.decode(s)
|
||||
ctx = decoded + "\n"
|
||||
ctx = f"{decoded}\n"
|
||||
|
||||
|
||||
def main():
|
||||
@@ -254,7 +254,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--hf",
|
||||
action="store_true",
|
||||
help="Whether to save the model in the huggingface format. (default: False)",
|
||||
help="Whether to save the model in the Hugging Face format. (default: False)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
|
||||
|
||||
@@ -49,7 +49,7 @@ def write_header(f_out, header):
|
||||
def write_tokens(fout, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
@@ -60,13 +60,13 @@ def write_tokens(fout, tokenizer):
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
# TODO: GPT4All - add extra <pad> token
|
||||
text = "<pad>".encode("utf-8")
|
||||
text = "<pad>".encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", 0.0))
|
||||
|
||||
@@ -50,7 +50,7 @@ fout.write(struct.pack("i", 4))
|
||||
# This loop unchanged from convert-pth-to-ggml.py:
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
@@ -61,13 +61,13 @@ for i in range(tokenizer.vocab_size()):
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
def write_header(shape, dst_name, ftype_cur):
|
||||
sname = dst_name.encode('utf-8')
|
||||
sname = dst_name.encode()
|
||||
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
|
||||
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
|
||||
fout.write(sname)
|
||||
@@ -80,7 +80,7 @@ def write_header(shape, dst_name, ftype_cur):
|
||||
def convert_non_q4(src_name, dst_name):
|
||||
v = model[src_name]
|
||||
shape = v.shape
|
||||
print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
|
||||
print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
|
||||
if len(shape) == 1:
|
||||
print(" Converting to float32")
|
||||
v = v.to(torch.float32)
|
||||
@@ -105,7 +105,7 @@ def convert_q4(src_name, dst_name, permute=False):
|
||||
# Each int32 item is actually 8 int4 items packed together, and it's transposed.
|
||||
shape = (qweight.shape[0], qweight.shape[1] * 8)
|
||||
|
||||
print("Processing Q4 variable: " + src_name + " with shape: ", shape)
|
||||
print(f"Processing Q4 variable: {src_name} with shape: {shape}")
|
||||
|
||||
# The output format has the int4 weights in groups of 32 rather than 8.
|
||||
# It looks like this:
|
||||
@@ -168,5 +168,5 @@ for i in range(n_layer):
|
||||
|
||||
fout.close()
|
||||
|
||||
print("Done. Output file: " + fname_out)
|
||||
print("")
|
||||
print(f"Done. Output file: {fname_out}")
|
||||
print()
|
||||
|
||||
@@ -120,7 +120,7 @@ def write_header(fout, hparams, ftype):
|
||||
def write_tokens(fout, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
@@ -131,7 +131,7 @@ def write_tokens(fout, tokenizer):
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
@@ -191,7 +191,7 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):
|
||||
fullshape = list(partshape)
|
||||
if n_dims > 1:
|
||||
fullshape[split_dim] *= n_parts
|
||||
sname = name.encode('utf-8')
|
||||
sname = name.encode()
|
||||
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
|
||||
for dim in reversed(fullshape):
|
||||
fout.write(struct.pack("i", dim))
|
||||
|
||||
@@ -44,7 +44,7 @@ def write_header(f_out, header):
|
||||
def write_tokens(fout, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
@@ -55,7 +55,7 @@ def write_tokens(fout, tokenizer):
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
@@ -39,6 +39,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
gpt_params default_params;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
@@ -66,6 +68,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
std::ifstream file(argv[i]);
|
||||
if (!file) {
|
||||
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
||||
if (params.prompt.back() == '\n') {
|
||||
params.prompt.pop_back();
|
||||
@@ -168,7 +175,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
params.n_parts = std::stoi(argv[i]);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
gpt_print_usage(argc, argv, params);
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(0);
|
||||
} else if (arg == "--random-prompt") {
|
||||
params.random_prompt = true;
|
||||
@@ -180,13 +187,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.input_prefix = argv[i];
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, params);
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, params);
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
192
ggml.c
192
ggml.c
@@ -461,6 +461,39 @@ static inline __m128i packNibbles( __m256i bytes )
|
||||
__m128i r1 = _mm256_extracti128_si256( bytes, 1 );
|
||||
return _mm_packus_epi16( r0, r1 );
|
||||
}
|
||||
#elif __AVX__
|
||||
static inline __m128i bytesFromNibbles( const uint8_t* rsi )
|
||||
{
|
||||
// Load 8 bytes from memory
|
||||
__m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
|
||||
|
||||
// Expand bytes into uint16_t values
|
||||
__m128i bytes = _mm_cvtepu8_epi16( tmp );
|
||||
|
||||
// Unpack values into individual bytes
|
||||
const __m128i lowMask = _mm_set1_epi8( 0xF );
|
||||
__m128i high = _mm_andnot_si128( lowMask, bytes );
|
||||
__m128i low = _mm_and_si128( lowMask, bytes );
|
||||
high = _mm_slli_epi16( high, 4 );
|
||||
bytes = _mm_or_si128( low, high );
|
||||
return bytes;
|
||||
}
|
||||
|
||||
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
||||
{
|
||||
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
|
||||
const __m128i lowByte = _mm_set1_epi16( 0xFF );
|
||||
__m128i high = _mm_andnot_si128( lowByte, bytes1 );
|
||||
__m128i low = _mm_and_si128( lowByte, bytes1 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes1 = _mm_or_si128( low, high );
|
||||
high = _mm_andnot_si128( lowByte, bytes2 );
|
||||
low = _mm_and_si128( lowByte, bytes2 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes2 = _mm_or_si128( low, high );
|
||||
|
||||
return _mm_packus_epi16( bytes1, bytes2);
|
||||
}
|
||||
#endif
|
||||
|
||||
// method 5
|
||||
@@ -509,8 +542,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
|
||||
const uint8_t vi0 = (int8_t)roundf(v0) + 8;
|
||||
const uint8_t vi1 = (int8_t)roundf(v1) + 8;
|
||||
|
||||
assert(vi0 >= 0 && vi0 < 16);
|
||||
assert(vi1 >= 0 && vi1 < 16);
|
||||
assert(vi0 < 16);
|
||||
assert(vi1 < 16);
|
||||
|
||||
pp[l/2] = vi0 | (vi1 << 4);
|
||||
}
|
||||
@@ -660,6 +693,80 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
|
||||
__m128i res = packNibbles( i0 );
|
||||
_mm_storeu_si128( ( __m128i* )y[i].qs, res );
|
||||
}
|
||||
#elif defined(__AVX__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Load elements into 4 AVX vectors
|
||||
__m256 v0 = _mm256_loadu_ps( x );
|
||||
__m256 v1 = _mm256_loadu_ps( x + 8 );
|
||||
__m256 v2 = _mm256_loadu_ps( x + 16 );
|
||||
__m256 v3 = _mm256_loadu_ps( x + 24 );
|
||||
x += 32;
|
||||
|
||||
// Compute max(abs(e)) for the block
|
||||
const __m256 signBit = _mm256_set1_ps( -0.0f );
|
||||
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
|
||||
|
||||
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
|
||||
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
|
||||
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
|
||||
const float maxScalar = _mm_cvtss_f32( max4 );
|
||||
|
||||
// Quantize these floats
|
||||
const float d = maxScalar / 7.0f;
|
||||
y[i].d = d;
|
||||
const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
|
||||
const __m256 mul = _mm256_set1_ps( id );
|
||||
|
||||
// Apply the multiplier
|
||||
v0 = _mm256_mul_ps( v0, mul );
|
||||
v1 = _mm256_mul_ps( v1, mul );
|
||||
v2 = _mm256_mul_ps( v2, mul );
|
||||
v3 = _mm256_mul_ps( v3, mul );
|
||||
|
||||
// Round to nearest integer
|
||||
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
|
||||
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
|
||||
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
|
||||
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
|
||||
|
||||
// Convert floats to integers
|
||||
__m256i i0 = _mm256_cvtps_epi32( v0 );
|
||||
__m256i i1 = _mm256_cvtps_epi32( v1 );
|
||||
__m256i i2 = _mm256_cvtps_epi32( v2 );
|
||||
__m256i i3 = _mm256_cvtps_epi32( v3 );
|
||||
|
||||
// Since we don't have in AVX some necessary functions,
|
||||
// we split the registers in half and call AVX2 analogs from SSE
|
||||
__m128i ni0 = _mm256_castsi256_si128( i0 );
|
||||
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
|
||||
__m128i ni2 = _mm256_castsi256_si128( i1 );
|
||||
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
|
||||
__m128i ni4 = _mm256_castsi256_si128( i2 );
|
||||
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
|
||||
__m128i ni6 = _mm256_castsi256_si128( i3 );
|
||||
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
|
||||
|
||||
// Convert int32 to int16
|
||||
ni0 = _mm_packs_epi32( ni0, ni1 );
|
||||
ni2 = _mm_packs_epi32( ni2, ni3 );
|
||||
ni4 = _mm_packs_epi32( ni4, ni5 );
|
||||
ni6 = _mm_packs_epi32( ni6, ni7 );
|
||||
// Convert int16 to int8
|
||||
ni0 = _mm_packs_epi16( ni0, ni2 );
|
||||
ni4 = _mm_packs_epi16( ni4, ni6 );
|
||||
|
||||
// Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
|
||||
const __m128i off = _mm_set1_epi8( 8);
|
||||
ni0 = _mm_add_epi8( ni0, off );
|
||||
ni4 = _mm_add_epi8( ni4, off );
|
||||
|
||||
// Compress the vector into 4 bit/value, and store
|
||||
__m128i res = packNibbles( ni0, ni4 );
|
||||
_mm_storeu_si128( ( __m128i* )y[i].qs, res );
|
||||
}
|
||||
#elif defined(__wasm_simd128__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
@@ -730,8 +837,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
|
||||
const uint8_t vi0 = roundf(v0);
|
||||
const uint8_t vi1 = roundf(v1);
|
||||
|
||||
assert(vi0 >= 0 && vi0 < 16);
|
||||
assert(vi1 >= 0 && vi1 < 16);
|
||||
assert(vi0 < 16);
|
||||
assert(vi1 < 16);
|
||||
|
||||
pp[l/2] = vi0 | (vi1 << 4);
|
||||
}
|
||||
@@ -1726,7 +1833,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
||||
const block_q4_0 * restrict x = vx;
|
||||
const block_q4_0 * restrict y = vy;
|
||||
|
||||
ggml_float sumf = 0.0;
|
||||
float sumf = 0.0;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float sum0 = 0.0f;
|
||||
@@ -1821,7 +1928,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
||||
#endif
|
||||
}
|
||||
|
||||
sumf = (ggml_float)(sum0 + sum1);
|
||||
sumf = sum0 + sum1;
|
||||
#elif defined(__AVX512F__)
|
||||
// Initialize accumulator with zeros
|
||||
__m512 acc0 = _mm512_setzero_ps();
|
||||
@@ -1855,6 +1962,10 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
// Main loop
|
||||
// TODO: figure a way to do this in a portable way
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC unroll 16
|
||||
#endif
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
// Compute combined scale for the block
|
||||
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
|
||||
@@ -1868,20 +1979,21 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
||||
bx = _mm256_sub_epi8( bx, off );
|
||||
by = _mm256_sub_epi8( by, off );
|
||||
|
||||
// Sign-extend first 16 signed bytes into int16_t
|
||||
__m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
|
||||
__m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
|
||||
// Compute products of int16_t integers, add pairwise
|
||||
__m256i i32 = _mm256_madd_epi16( x16, y16 );
|
||||
// Get absolute values of x vectors
|
||||
const __m256i ax = _mm256_sign_epi8(bx, bx);
|
||||
|
||||
// Sign-extend last 16 signed bytes into int16_t vectors
|
||||
x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
|
||||
y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
|
||||
// Accumulate products of int16_t integers
|
||||
i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) );
|
||||
// Sign the values of the y vectors
|
||||
const __m256i sy = _mm256_sign_epi8(by, bx);
|
||||
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
|
||||
|
||||
const __m256i ones = _mm256_set1_epi16(1);
|
||||
const __m256i i32 = _mm256_madd_epi16(ones, dot);
|
||||
|
||||
// Convert int32_t to float
|
||||
__m256 p = _mm256_cvtepi32_ps( i32 );
|
||||
const __m256 p = _mm256_cvtepi32_ps( i32 );
|
||||
|
||||
// Apply the scale, and accumulate
|
||||
acc = _mm256_fmadd_ps( d, p, acc );
|
||||
}
|
||||
@@ -1892,6 +2004,52 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
||||
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
|
||||
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
|
||||
|
||||
sumf = _mm_cvtss_f32( res );
|
||||
#elif defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
// Compute combined scale for the block
|
||||
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
|
||||
|
||||
__m128i i32[2];
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
// Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
|
||||
__m128i bx = bytesFromNibbles( x[i].qs + 8*j );
|
||||
__m128i by = bytesFromNibbles( y[i].qs + 8*j );
|
||||
|
||||
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
||||
const __m128i off = _mm_set1_epi8( 8 );
|
||||
bx = _mm_sub_epi8( bx, off );
|
||||
by = _mm_sub_epi8( by, off );
|
||||
|
||||
// Get absolute values of x vectors
|
||||
const __m128i ax = _mm_sign_epi8(bx, bx);
|
||||
|
||||
// Sign the values of the y vectors
|
||||
const __m128i sy = _mm_sign_epi8(by, bx);
|
||||
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m128i dot = _mm_maddubs_epi16(ax, sy);
|
||||
|
||||
const __m128i ones = _mm_set1_epi16(1);
|
||||
i32[j] = _mm_madd_epi16(ones, dot);
|
||||
}
|
||||
|
||||
// Convert int32_t to float
|
||||
__m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
|
||||
// Apply the scale, and accumulate
|
||||
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
|
||||
}
|
||||
|
||||
// Return horizontal sum of the acc vector
|
||||
__m128 res = _mm256_extractf128_ps( acc, 1 );
|
||||
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
|
||||
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
|
||||
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
|
||||
|
||||
sumf = _mm_cvtss_f32( res );
|
||||
#elif defined(__wasm_simd128__)
|
||||
// wasm simd
|
||||
|
||||
@@ -272,13 +272,11 @@ def main():
|
||||
tokens = read_tokens(fin, hparams)
|
||||
|
||||
if hparams['magic'] == 0x67676a74: # ggjt
|
||||
print("%s: input ggml has already been converted to 'ggjt' magic\n" %
|
||||
(args.fin_path))
|
||||
print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
|
||||
sys.exit(1)
|
||||
|
||||
if hparams['magic'] != 0x67676d66: # ggmf
|
||||
print("%s: input ggml file doesn't have expected 'ggmf' magic: %#x\n" %
|
||||
(args.fin_path, hparams['magic']))
|
||||
print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
|
||||
sys.exit(1)
|
||||
|
||||
hparams['magic'] = 0x67676a74 # ggjt
|
||||
@@ -286,7 +284,7 @@ def main():
|
||||
# count number of multipart files by convention
|
||||
n_parts = 1
|
||||
while True:
|
||||
if os.path.exists("%s.%d" % (args.fin_path, n_parts)):
|
||||
if os.path.exists(f"{args.fin_path}.{n_parts}"):
|
||||
n_parts += 1
|
||||
else:
|
||||
break
|
||||
@@ -302,7 +300,7 @@ def main():
|
||||
print(f"Processing part {part_id+1} of {n_parts}\n")
|
||||
fin_path = args.fin_path
|
||||
if part_id > 0:
|
||||
fin_path += ".%d" % (part_id)
|
||||
fin_path += f".{part_id}"
|
||||
with open(fin_path, "rb") as fin:
|
||||
read_tokens(fin, read_hparams(fin))
|
||||
copy_tensors(fin, fout, part_id, n_parts)
|
||||
|
||||
131
quantize.py
131
quantize.py
@@ -1,131 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""Script to execute the "quantize" script on a given set of models."""
|
||||
|
||||
import subprocess
|
||||
import argparse
|
||||
import glob
|
||||
import sys
|
||||
import os
|
||||
|
||||
|
||||
def main():
|
||||
"""Update the quantize binary name depending on the platform and parse
|
||||
the command line arguments and execute the script.
|
||||
"""
|
||||
|
||||
if "linux" in sys.platform or "darwin" in sys.platform:
|
||||
quantize_script_binary = "quantize"
|
||||
|
||||
elif "win32" in sys.platform or "cygwin" in sys.platform:
|
||||
quantize_script_binary = "quantize.exe"
|
||||
|
||||
else:
|
||||
print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
|
||||
quantize_script_binary = "quantize"
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
prog='python3 quantize.py',
|
||||
description='This script quantizes the given models by applying the '
|
||||
f'"{quantize_script_binary}" script on them.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
|
||||
help='The models to quantize.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-r', '--remove-16', action='store_true', dest='remove_f16',
|
||||
help='Remove the f16 model after quantizing it.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-m', '--models-path', dest='models_path',
|
||||
default=os.path.join(os.getcwd(), "models"),
|
||||
help='Specify the directory where the models are located.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-q', '--quantize-script-path', dest='quantize_script_path',
|
||||
default=os.path.join(os.getcwd(), quantize_script_binary),
|
||||
help='Specify the path to the "quantize" script.'
|
||||
)
|
||||
|
||||
# TODO: Revise this code
|
||||
# parser.add_argument(
|
||||
# '-t', '--threads', dest='threads', type='int',
|
||||
# default=os.cpu_count(),
|
||||
# help='Specify the number of threads to use to quantize many models at '
|
||||
# 'once. Defaults to os.cpu_count().'
|
||||
# )
|
||||
|
||||
args = parser.parse_args()
|
||||
args.models_path = os.path.abspath(args.models_path)
|
||||
|
||||
if not os.path.isfile(args.quantize_script_path):
|
||||
print(
|
||||
f'The "{quantize_script_binary}" script was not found in the '
|
||||
"current location.\nIf you want to use it from another location, "
|
||||
"set the --quantize-script-path argument from the command line."
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
for model in args.models:
|
||||
# The model is separated in various parts
|
||||
# (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
|
||||
f16_model_path_base = os.path.join(
|
||||
args.models_path, model, "ggml-model-f16.bin"
|
||||
)
|
||||
|
||||
if not os.path.isfile(f16_model_path_base):
|
||||
print(f'The file %s was not found' % f16_model_path_base)
|
||||
sys.exit(1)
|
||||
|
||||
f16_model_parts_paths = map(
|
||||
lambda filename: os.path.join(f16_model_path_base, filename),
|
||||
glob.glob(f"{f16_model_path_base}*")
|
||||
)
|
||||
|
||||
for f16_model_part_path in f16_model_parts_paths:
|
||||
if not os.path.isfile(f16_model_part_path):
|
||||
print(
|
||||
f"The f16 model {os.path.basename(f16_model_part_path)} "
|
||||
f"was not found in {args.models_path}{os.path.sep}{model}"
|
||||
". If you want to use it from another location, set the "
|
||||
"--models-path argument from the command line."
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
__run_quantize_script(
|
||||
args.quantize_script_path, f16_model_part_path
|
||||
)
|
||||
|
||||
if args.remove_f16:
|
||||
os.remove(f16_model_part_path)
|
||||
|
||||
|
||||
# This was extracted to a top-level function for parallelization, if
|
||||
# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
|
||||
|
||||
def __run_quantize_script(script_path, f16_model_part_path):
|
||||
"""Run the quantize script specifying the path to it and the path to the
|
||||
f16 model to quantize.
|
||||
"""
|
||||
|
||||
new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
|
||||
subprocess.run(
|
||||
[script_path, f16_model_part_path, new_quantized_model_path, "2"],
|
||||
check=True
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
main()
|
||||
|
||||
except subprocess.CalledProcessError:
|
||||
print("\nAn error ocurred while trying to quantize the models.")
|
||||
sys.exit(1)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
|
||||
else:
|
||||
print("\nSuccesfully quantized all models.")
|
||||
Reference in New Issue
Block a user