Compare commits

...

6 Commits

Author SHA1 Message Date
Pavol Rusnak
1506737499 Add mmap pages stats (disabled by default)
Stats can be enabled by changing #if 0 to #if 1.

Stats are printed at the end, when the mmap block is being unmapped,
but print_pages_stats can be called at any time between
mmap and munmap.
2023-04-16 18:22:30 +02:00
Georgi Gerganov
3173a62eb9 stdout : vertical align outputs for better readability 2023-04-16 13:59:27 +03:00
Pavol Rusnak
489537e6cf examples: add missing <ctime> include for time() (#1011) 2023-04-16 10:13:00 +00:00
nanahi
2d3481c721 Fix msys2 build error and warnings (#1009) 2023-04-16 11:13:42 +02:00
comex
74f5899df4 convert.py: Fix loading safetensors and ggml format on Windows (#991)
Calling `mmap.mmap` on Windows apparently resets the file offset of the
raw file object (and makes the BufferedReader return a *negative* file
offset).  For safetensors, avoid using the file offset after calling
mmap.  For GGML format, explicitly save and restore the offset.

Fixes #966.
2023-04-15 23:53:21 +02:00
Stephan Walter
2f7c8e014e Fix potential int8 overflow in non-SIMD vec_dot (#986) 2023-04-15 18:28:56 +00:00
7 changed files with 66 additions and 16 deletions

convert.py

@@ -735,7 +735,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
     header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
     # Use mmap for the actual data to avoid race conditions with the file offset.
     mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    byte_buf = mapped[fp.tell():]
+    byte_buf = mapped[8 + header_size:]
 
     def convert(info: Dict[str, Any]) -> LazyTensor:
         data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
@@ -761,7 +761,7 @@ def must_read(fp: IO[bytes], length: int) -> bytes:
     return ret
 
 
-def lazy_load_ggml_file(fp: IO[bytes], path: Path) -> ModelPlus:
+def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
     magic = must_read(fp, 4)[::-1]
     if magic in (b'ggmf', b'ggjt'):
         version, = struct.unpack("i", must_read(fp, 4))
@@ -795,7 +795,9 @@ def lazy_load_ggml_file(fp: IO[bytes], path: Path) -> ModelPlus:
     model: LazyModel = {}
 
     # Use mmap for the actual data to avoid race conditions with the file offset.
+    off = fp.raw.tell()
     mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+    fp.raw.seek(off)  # needed on Windows
 
     def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
         shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
@@ -949,8 +951,9 @@ class OutputFile:
     ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
     for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-        size = ' x '.join(map(str, lazy_tensor.shape))
-        print(f"[{i+1}/{len(model)}] Writing tensor {name}, size {size}...")
+        size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+        padi = len(str(len(model)))
+        print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
         of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
         ndarray.tofile(of.fout)
 
     of.fout.close()

examples/embedding/embedding.cpp

@@ -1,6 +1,8 @@
#include "common.h"
#include "llama.h"
#include <ctime>
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";

examples/main/main.cpp

@@ -11,6 +11,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
+#include <ctime>
 #include <fstream>
 #include <iostream>
 #include <string>

examples/perplexity/perplexity.cpp

@@ -2,6 +2,7 @@
#include "llama.h"
#include <cmath>
#include <ctime>
std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
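Note: the three examples above call time() (to seed from the wall clock), and time() is declared in <ctime> in C++; the build only worked before because another header happened to pull it in transitively. A minimal sketch of the usage these includes support (the seeding line is illustrative, not a quote from the examples):

#include <cstdio>
#include <ctime>    // declares std::time; previously only available transitively

int main() {
    // seed a generator from the wall clock, as the examples do with time(NULL)
    unsigned seed = (unsigned) std::time(nullptr);
    std::printf("seed = %u\n", seed);
    return 0;
}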

ggml.c

@@ -2373,11 +2373,11 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
             const uint8_t v0 = p0[j];
             const uint8_t v1 = p1[j];
 
-            const int8_t i0 = (int8_t) (v0 & 0xf) - 8;
-            const int8_t i1 = (int8_t) (v0 >> 4) - 8;
+            const int i0 = (v0 & 0xf) - 8;
+            const int i1 = (v0 >> 4) - 8;
 
-            const int8_t i2 = (int8_t) (v1 & 0xf) - 8;
-            const int8_t i3 = (int8_t) (v1 >> 4) - 8;
+            const int i2 = (v1 & 0xf) - 8;
+            const int i3 = (v1 >> 4) - 8;
 
             sumi += i0*i2 + i1*i3;
         }
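Why the change matters: each 4-bit value minus 8 lies in [-8, 7], so a single product i0*i2 can reach (-8)·(-8) = 64, and the pair sum i0*i2 + i1*i3 can reach 128, one past INT8_MAX (127). Keeping the intermediates in plain int removes any risk of the result being squeezed back through an 8-bit type. A minimal check of the bound:

#include <cstdint>
#include <cstdio>

int main() {
    // nibbles are in [0, 15]; subtracting 8 maps them to [-8, 7]
    // worst-case single product: (-8) * (-8) = 64
    // worst-case pair sum: 64 + 64 = 128, which exceeds INT8_MAX
    int worst = (-8) * (-8) + (-8) * (-8);
    std::printf("worst = %d, INT8_MAX = %d\n", worst, INT8_MAX);
    return 0;
}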

llama.cpp

@@ -9,6 +9,7 @@
#include "ggml.h"
#include <array>
#include <ctime>
#include <cinttypes>
#include <fstream>
#include <random>
@@ -261,12 +262,12 @@ static size_t checked_div(size_t a, size_t b) {
 }
 
 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
-    std::string ret = "[" + std::to_string(ne.at(0));
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
     for (size_t i = 1; i < ne.size(); i++) {
-        ret += " x " + std::to_string(ne.at(i));
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
     }
-    ret += "]";
-    return ret;
+    return buf;
 }
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
@@ -941,8 +942,8 @@ static void llama_model_load_internal(
     ml->ggml_ctx = ctx;
 
     model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-    model.norm = ml->get_tensor("norm.weight", {n_embd});
-    model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+    model.norm           = ml->get_tensor("norm.weight", {n_embd});
+    model.output         = ml->get_tensor("output.weight", {n_embd, n_vocab});
 
     model.layers.resize(n_layer);
     for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1569,7 +1570,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             tensor.data = read_data.addr;
             model_loader->load_data_for(tensor);
 
-            printf("[%zu/%zu] %36s - %s, type = %6s, ",
+            printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
                    ++idx, model_loader->tensors_map.tensors.size(),
                    tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
                    ggml_type_name(tensor.type));
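The width specifiers above are the whole point of the alignment commit: %4zu pads the tensor counter and %5u pads each dimension so consecutive progress lines form columns. A self-contained sketch of the effect (tensor names and sizes are made up for illustration):

#include <cstdio>

int main() {
    // fixed-width conversions keep successive log lines vertically aligned
    std::printf("[%4zu/%4zu] %36s - %16s, type = %6s\n",
                (std::size_t) 3, (std::size_t) 291, "tok_embeddings.weight", "4096 x 32000", "q4_0");
    std::printf("[%4zu/%4zu] %36s - %16s, type = %6s\n",
                (std::size_t) 4, (std::size_t) 291, "norm.weight", "4096", "f32");
    return 0;
}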

llama_util.h

@@ -43,8 +43,12 @@
 } while (0)
 
 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
+#endif
 #endif
 static std::string format(const char * fmt, ...) {
     va_list ap, ap2;
     va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
     va_end(ap2);
     va_end(ap);
     return std::string(buf.data(), size);
-};
+}
 
 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
@@ -184,7 +188,45 @@ struct llama_mmap {
         }
     }
 
+#if 0
+    void print_pages_stats() {
+        long sz = sysconf(_SC_PAGESIZE);
+        fprintf(stderr, "\n");
+        fprintf(stderr, "mmap pages stats (page size = %ld):\n", sz);
+        int pages = size / sz;
+        char *vec = (char *)malloc(pages);
+        if (mincore(addr, size, vec) != 0) {
+            fprintf(stderr, "mincore failed: %s\n", strerror(errno));
+        }
+        int resident = 0, nonresident = 0;
+        for (int i = 0; i < pages; i++) {
+            if (i % 80 == 0) {
+                fprintf(stderr, "%08d: ", i);
+            }
+            fprintf(stderr, vec[i] & 1 ? "x" : ".");
+            if (vec[i] & 1) {
+                resident++;
+            } else {
+                nonresident++;
+            }
+            if (i % 80 == (80 - 1)) {
+                fprintf(stderr, "\n");
+            }
+        }
+        fprintf(stderr, "\n");
+        fprintf(stderr, "pages resident : %d (%0.2f%%)\n", resident, 100.0f * resident / pages);
+        fprintf(stderr, "pages non-resident : %d (%0.2f%%)\n", nonresident, 100.0f * nonresident / pages);
+        fprintf(stderr, "pages total : %d\n", pages);
+        fprintf(stderr, "\n");
+        free(vec);
+    }
+#endif
+
     ~llama_mmap() {
+        // set to 1 to print the mmaped pages stats
+#if 0
+        print_pages_stats();
+#endif
         munmap(addr, size);
     }
 #elif defined(_WIN32)
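Background on the gnu_printf attribute added above: on MinGW, GCC's format(printf, ...) checking follows the Microsoft C runtime's dialect, which does not recognize C99 length modifiers such as %zu, so correct code triggers format warnings; format(gnu_printf, ...) tells GCC to check against GNU/C99 semantics instead, which is what the msys2 fix in this compare addresses. A sketch of the same pattern outside llama_util.h (logf is a hypothetical helper, not from the codebase):

#include <cstdarg>
#include <cstdio>

#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))   // check against GNU/C99 printf semantics
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static void logf(const char * fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
}

int main() {
    size_t n = 42;
    logf("n = %zu\n", n);   // fine with gnu_printf; plain printf checking on MinGW warns here
    return 0;
}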