fix default params for examples/main (#697 )

py: huggingface -> Hugging Face (#686 )
readme: replace termux links with homepage, play store is deprecated (#680 )
2026-05-05 16:44:11 +00:00 · 2023-04-02 04:41:12 +02:00 · 2023-04-01 18:38:18 +02:00 · 2023-04-01 16:57:30 +02:00 · 2023-04-01 16:08:40 +02:00 · 2023-03-31 19:19:16 +00:00
11 changed files with 222 additions and 186 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,9 @@ option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
 # Compile flags
 #

+set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
+set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
--- a/README.md
+++ b/README.md
@@ -37,9 +37,11 @@ Supported platforms:

 Supported models:

- [X] LLaMA
+- [X] LLaMA 🦙
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
+- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)

 ---

@@ -153,8 +155,8 @@ python3 -m pip install torch numpy sentencepiece
 # convert the 7B model to ggml FP16 format
 python3 convert-pth-to-ggml.py models/7B/ 1

-# quantize the model to 4-bits
-python3 quantize.py 7B
+# quantize the model to 4-bits (using method 2 = q4_0)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2

 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
@@ -299,7 +301,7 @@ And after 4.45 hours, you will have the final perplexity.

 ### Android

-You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
+You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
 First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
 ```
 $ mkdir build-android
@@ -308,7 +310,7 @@ $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-Install [termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
+Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
 Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@@ -27,9 +27,9 @@ def read_tokens(fin, vocab_size):
        text_len = struct.unpack("i", fin.read(4))[0]
        text_bytes = fin.read(text_len)
        try:
-            text = text_bytes.decode("utf-8")
+            text = text_bytes.decode()
        except UnicodeDecodeError:
-            text = text_bytes.decode("utf-8", "replace")
+            text = text_bytes.decode(errors="replace")
        score = struct.unpack("f", fin.read(4))[0]
        tokens.append((text, score))
    return tokens
@@ -82,7 +82,7 @@ def read_variables(fin):

        shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
        shape = shape[::-1]
-        name = fin.read(name_length).decode("utf-8")
+        name = fin.read(name_length).decode()

        # ensure tensor data is aligned
        tensor_data_offset = fin.tell()
@@ -199,7 +199,7 @@ def chat(model, hparams, llama_dir):
    device = torch.device("cpu")
    llama = llama.to(device)

-    ctx = """You are AI. 
+    ctx = """You are AI.
 This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
 User: Hello, AI.
 AI: Hello! How can I assist you today?
@@ -207,11 +207,11 @@ AI: Hello! How can I assist you today?
    print(ctx.rstrip("\n"))
    while True:
        print("-" * 60)
-        prompt = input(f"User: ")
+        prompt = input("User: ")
        if ctx != "":
-            ctx = ctx + "User: " + prompt + "\n"
+            ctx = f"{ctx}User: {prompt}\n"
        else:
-            ctx = prompt + "\nAI:"
+            ctx = f"{prompt}\nAI:"

        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx

@@ -236,7 +236,7 @@ AI: Hello! How can I assist you today?
                )
            s = generation_output.sequences[0]
            decoded = tokenizer.decode(s)
-            ctx = decoded + "\n"
+            ctx = f"{decoded}\n"


 def main():
@@ -254,7 +254,7 @@ def main():
    parser.add_argument(
        "--hf",
        action="store_true",
-        help="Whether to save the model in the huggingface format. (default: False)",
+        help="Whether to save the model in the Hugging Face format. (default: False)",
    )
    parser.add_argument(
        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@@ -49,7 +49,7 @@ def write_header(f_out, header):
 def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
@@ -60,13 +60,13 @@ def write_tokens(fout, tokenizer):
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

    # TODO: GPT4All - add extra <pad> token
-    text = "<pad>".encode("utf-8")
+    text = "<pad>".encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", 0.0))
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@@ -50,7 +50,7 @@ fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
-        text = " \u2047 ".encode("utf-8")
+        text = " \u2047 ".encode()
    elif tokenizer.is_control(i):
        text = b""
    elif tokenizer.is_byte(i):
@@ -61,13 +61,13 @@ for i in range(tokenizer.vocab_size()):
        byte_value = int(piece[3:-1], 16)
        text = struct.pack("B", byte_value)
    else:
-        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", tokenizer.get_score(i)))

 def write_header(shape, dst_name, ftype_cur):
-    sname = dst_name.encode('utf-8')
+    sname = dst_name.encode()
    fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
    fout.write(sname)
@@ -80,7 +80,7 @@ def write_header(shape, dst_name, ftype_cur):
 def convert_non_q4(src_name, dst_name):
    v = model[src_name]
    shape = v.shape
-    print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
+    print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
    if len(shape) == 1:
        print("  Converting to float32")
        v = v.to(torch.float32)
@@ -105,7 +105,7 @@ def convert_q4(src_name, dst_name, permute=False):
    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
    shape = (qweight.shape[0], qweight.shape[1] * 8)

-    print("Processing Q4 variable: " + src_name + " with shape: ", shape)
+    print(f"Processing Q4 variable: {src_name} with shape: {shape}")

    # The output format has the int4 weights in groups of 32 rather than 8.
    # It looks like this:
@@ -168,5 +168,5 @@ for i in range(n_layer):

 fout.close()

-print("Done. Output file: " + fname_out)
-print("")
+print(f"Done. Output file: {fname_out}")
+print()
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -120,7 +120,7 @@ def write_header(fout, hparams, ftype):
 def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
@@ -131,7 +131,7 @@ def write_tokens(fout, tokenizer):
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))
@@ -191,7 +191,7 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
-        sname = name.encode('utf-8')
+        sname = name.encode()
        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@@ -44,7 +44,7 @@ def write_header(f_out, header):
 def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
@@ -55,7 +55,7 @@ def write_tokens(fout, tokenizer):
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -39,6 +39,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {

    bool invalid_param = false;
    std::string arg;
+    gpt_params default_params;
+
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

@@ -66,6 +68,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
@@ -168,7 +175,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            }
            params.n_parts = std::stoi(argv[i]);
        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, params);
+            gpt_print_usage(argc, argv, default_params);
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
@@ -180,13 +187,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.input_prefix = argv[i];
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, params);
+            gpt_print_usage(argc, argv, default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, params);
+        gpt_print_usage(argc, argv, default_params);
        exit(1);
    }

--- a/ggml.c
+++ b/ggml.c
@@ -461,6 +461,39 @@ static inline __m128i packNibbles( __m256i bytes )
    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
    return _mm_packus_epi16( r0, r1 );
 }
+#elif __AVX__
+static inline __m128i bytesFromNibbles( const uint8_t* rsi )
+{
+    // Load 8 bytes from memory
+    __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+
+    // Expand bytes into uint16_t values
+    __m128i bytes = _mm_cvtepu8_epi16( tmp );
+
+    // Unpack values into individual bytes
+    const __m128i lowMask = _mm_set1_epi8( 0xF );
+    __m128i high = _mm_andnot_si128( lowMask, bytes );
+    __m128i low = _mm_and_si128( lowMask, bytes );
+    high = _mm_slli_epi16( high, 4 );
+    bytes = _mm_or_si128( low, high );
+    return bytes;
+}
+
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2);
+}
 #endif

 // method 5
@@ -509,8 +542,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
            const uint8_t vi0 = (int8_t)roundf(v0) + 8;
            const uint8_t vi1 = (int8_t)roundf(v1) + 8;

-            assert(vi0 >= 0 && vi0 < 16);
-            assert(vi1 >= 0 && vi1 < 16);
+            assert(vi0 < 16);
+            assert(vi1 < 16);

            pp[l/2] = vi0 | (vi1 << 4);
        }
@@ -660,6 +693,80 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
        __m128i res = packNibbles( i0 );
        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
    }
+#elif defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 7.0f;
+        y[i].d = d;
+        const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
+        const __m128i off = _mm_set1_epi8( 8);
+        ni0 = _mm_add_epi8( ni0, off );
+        ni4 = _mm_add_epi8( ni4, off );
+
+        // Compress the vector into 4 bit/value, and store
+        __m128i res = packNibbles( ni0, ni4 );
+        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
+    }
 #elif defined(__wasm_simd128__)
    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max
@@ -730,8 +837,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
            const uint8_t vi0 = roundf(v0);
            const uint8_t vi1 = roundf(v1);

-            assert(vi0 >= 0 && vi0 < 16);
-            assert(vi1 >= 0 && vi1 < 16);
+            assert(vi0 < 16);
+            assert(vi1 < 16);

            pp[l/2] = vi0 | (vi1 << 4);
        }
@@ -1726,7 +1833,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
    const block_q4_0 * restrict x = vx;
    const block_q4_0 * restrict y = vy;

-    ggml_float sumf = 0.0;
+    float sumf = 0.0;

 #if defined(__ARM_NEON)
    float sum0 = 0.0f;
@@ -1821,7 +1928,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
 #endif
    }

-    sumf = (ggml_float)(sum0 + sum1);
+    sumf = sum0 + sum1;
 #elif defined(__AVX512F__)
    // Initialize accumulator with zeros
    __m512 acc0 = _mm512_setzero_ps();
@@ -1855,6 +1962,10 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
    __m256 acc = _mm256_setzero_ps();

    // Main loop
+    // TODO: figure a way to do this in a portable way
+    #ifdef __GNUC__
+    #pragma GCC unroll 16
+    #endif
    for (int i = 0; i < nb; ++i) {
        // Compute combined scale for the block
        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
@@ -1868,20 +1979,21 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
        bx = _mm256_sub_epi8( bx, off );
        by = _mm256_sub_epi8( by, off );

-        // Sign-extend first 16 signed bytes into int16_t
-        __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
-        __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
-        // Compute products of int16_t integers, add pairwise
-        __m256i i32 = _mm256_madd_epi16( x16, y16 );
+        // Get absolute values of x vectors
+        const __m256i ax = _mm256_sign_epi8(bx, bx);

-        // Sign-extend last 16 signed bytes into int16_t vectors
-        x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
-        y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
-        // Accumulate products of int16_t integers
-        i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) );
+        // Sign the values of the y vectors
+        const __m256i sy = _mm256_sign_epi8(by, bx);
+
+        // Perform multiplication and create 16-bit values
+        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+
+        const __m256i ones = _mm256_set1_epi16(1);
+        const __m256i i32 = _mm256_madd_epi16(ones, dot);

        // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps( i32 );
+        const __m256 p = _mm256_cvtepi32_ps( i32 );
+
        // Apply the scale, and accumulate
        acc = _mm256_fmadd_ps( d, p, acc );
    }
@@ -1892,6 +2004,52 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );

+    sumf = _mm_cvtss_f32( res );
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        // Compute combined scale for the block
+        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+
+        __m128i i32[2];
+        for (int j = 0; j < 2; ++j) {
+            // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
+            __m128i bx = bytesFromNibbles( x[i].qs + 8*j );
+            __m128i by = bytesFromNibbles( y[i].qs + 8*j );
+
+            // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+            const __m128i off = _mm_set1_epi8( 8 );
+            bx = _mm_sub_epi8( bx, off );
+            by = _mm_sub_epi8( by, off );
+
+	    // Get absolute values of x vectors
+            const __m128i ax = _mm_sign_epi8(bx, bx);
+
+            // Sign the values of the y vectors
+            const __m128i sy = _mm_sign_epi8(by, bx);
+
+            // Perform multiplication and create 16-bit values
+            const __m128i dot = _mm_maddubs_epi16(ax, sy);
+
+            const __m128i ones = _mm_set1_epi16(1);
+            i32[j] = _mm_madd_epi16(ones, dot);
+        }
+
+        // Convert int32_t to float
+        __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
+        // Apply the scale, and accumulate
+        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
+    }
+
+    // Return horizontal sum of the acc vector
+    __m128 res = _mm256_extractf128_ps( acc, 1 );
+    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
+    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
+    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
+
    sumf = _mm_cvtss_f32( res );
 #elif defined(__wasm_simd128__)
    // wasm simd
--- a/migrate-ggml-2023-03-30-pr613.py
+++ b/migrate-ggml-2023-03-30-pr613.py
@@ -272,13 +272,11 @@ def main():
        tokens = read_tokens(fin, hparams)

    if hparams['magic'] == 0x67676a74:  # ggjt
-        print("%s: input ggml has already been converted to 'ggjt' magic\n" %
-              (args.fin_path))
+        print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
        sys.exit(1)

    if hparams['magic'] != 0x67676d66:  # ggmf
-        print("%s: input ggml file doesn't have expected 'ggmf' magic: %#x\n" %
-              (args.fin_path, hparams['magic']))
+        print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
        sys.exit(1)

    hparams['magic'] = 0x67676a74  # ggjt
@@ -286,7 +284,7 @@ def main():
    # count number of multipart files by convention
    n_parts = 1
    while True:
-        if os.path.exists("%s.%d" % (args.fin_path, n_parts)):
+        if os.path.exists(f"{args.fin_path}.{n_parts}"):
            n_parts += 1
        else:
            break
@@ -302,7 +300,7 @@ def main():
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
-                fin_path += ".%d" % (part_id)
+                fin_path += f".{part_id}"
            with open(fin_path, "rb") as fin:
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)
--- a/quantize.py
+++ b/quantize.py
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-
-"""Script to execute the "quantize" script on a given set of models."""
-
-import subprocess
-import argparse
-import glob
-import sys
-import os
-
-
-def main():
-    """Update the quantize binary name depending on the platform and parse
-    the command line arguments and execute the script.
-    """
-
-    if "linux" in sys.platform or "darwin" in sys.platform:
-        quantize_script_binary = "quantize"
-
-    elif "win32" in sys.platform or "cygwin" in sys.platform:
-        quantize_script_binary = "quantize.exe"
-
-    else:
-        print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
-        quantize_script_binary = "quantize"
-
-    parser = argparse.ArgumentParser(
-        prog='python3 quantize.py',
-        description='This script quantizes the given models by applying the '
-        f'"{quantize_script_binary}" script on them.'
-    )
-    parser.add_argument(
-        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
-        help='The models to quantize.'
-    )
-    parser.add_argument(
-        '-r', '--remove-16', action='store_true', dest='remove_f16',
-        help='Remove the f16 model after quantizing it.'
-    )
-    parser.add_argument(
-        '-m', '--models-path', dest='models_path',
-        default=os.path.join(os.getcwd(), "models"),
-        help='Specify the directory where the models are located.'
-    )
-    parser.add_argument(
-        '-q', '--quantize-script-path', dest='quantize_script_path',
-        default=os.path.join(os.getcwd(), quantize_script_binary),
-        help='Specify the path to the "quantize" script.'
-    )
-
-    # TODO: Revise this code
-    # parser.add_argument(
-    #     '-t', '--threads', dest='threads', type='int',
-    #     default=os.cpu_count(),
-    #     help='Specify the number of threads to use to quantize many models at '
-    #     'once. Defaults to os.cpu_count().'
-    # )
-
-    args = parser.parse_args()
-    args.models_path = os.path.abspath(args.models_path)
-
-    if not os.path.isfile(args.quantize_script_path):
-        print(
-            f'The "{quantize_script_binary}" script was not found in the '
-            "current location.\nIf you want to use it from another location, "
-            "set the --quantize-script-path argument from the command line."
-        )
-        sys.exit(1)
-
-    for model in args.models:
-        # The model is separated in various parts
-        # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
-        f16_model_path_base = os.path.join(
-            args.models_path, model, "ggml-model-f16.bin"
-        )
-
-        if not os.path.isfile(f16_model_path_base):
-            print(f'The file %s was not found' % f16_model_path_base)
-            sys.exit(1)
-
-        f16_model_parts_paths = map(
-            lambda filename: os.path.join(f16_model_path_base, filename),
-            glob.glob(f"{f16_model_path_base}*")
-        )
-
-        for f16_model_part_path in f16_model_parts_paths:
-            if not os.path.isfile(f16_model_part_path):
-                print(
-                    f"The f16 model {os.path.basename(f16_model_part_path)} "
-                    f"was not found in {args.models_path}{os.path.sep}{model}"
-                    ". If you want to use it from another location, set the "
-                    "--models-path argument from the command line."
-                )
-                sys.exit(1)
-
-            __run_quantize_script(
-                args.quantize_script_path, f16_model_part_path
-            )
-
-            if args.remove_f16:
-                os.remove(f16_model_part_path)
-
-
-# This was extracted to a top-level function for parallelization, if
-# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
-
-def __run_quantize_script(script_path, f16_model_part_path):
-    """Run the quantize script specifying the path to it and the path to the
-    f16 model to quantize.
-    """
-
-    new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
-    subprocess.run(
-        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
-        check=True
-    )
-
-
-if __name__ == "__main__":
-    try:
-        main()
-
-    except subprocess.CalledProcessError:
-        print("\nAn error ocurred while trying to quantize the models.")
-        sys.exit(1)
-
-    except KeyboardInterrupt:
-        sys.exit(0)
-
-    else:
-        print("\nSuccesfully quantized all models.")
Author	SHA1	Message	Date
Murilo Santana	5b70e7de4c	fix default params for examples/main (#697 )	2023-04-02 04:41:12 +02:00
Ikko Eltociear Ashimine	a717cba844	py: huggingface -> Hugging Face (#686 )	2023-04-01 18:38:18 +02:00
rimoliga	d0a7f742e7	readme: replace termux links with homepage, play store is deprecated (#680 )	2023-04-01 16:57:30 +02:00
Slaren	0d054e292e	Show error message when -f fails	2023-04-01 16:08:40 +02:00
Stephan Walter	3525899277	Enable -std= for cmake builds, fix warnings (#598 )	2023-03-31 19:19:16 +00:00
slaren	1d08882afa	Optimize AVX2 ggml_vec_dot_q4_0 (#642 )	2023-03-31 15:55:52 +00:00
perserk	02c5b27e91	Add AVX acceleration (#617 ) * ggml : add AVX quantize_row_q4_0() * ggml : add AVX ggml_vec_dot_q4_0() * ggml : refactor AVX part of ggml_vec_dot_q4_0() https://github.com/ggerganov/llama.cpp/pull/617#issuecomment-1489985645	2023-03-31 13:55:44 +02:00
Pavol Rusnak	cbef542879	py : cleanup the code - use f-strings where possible - drop first param of encode/decode functions since "utf-8" is the default	2023-03-31 10:32:01 +02:00
Pavol Rusnak	9733104be5	drop quantize.py (now that models are using a single file)	2023-03-31 01:07:32 +02:00
Georgi Gerganov	3df890aef4	readme : update supported models	2023-03-30 22:31:54 +03:00