model : add Jina Embeddings v5 Nano (partial EuroBERT) support (#19826)

* WIP: Add EuroBERT support with autoformatting changes This commit includes: - EuroBERT model implementation for GGUF conversion - C++ backend support for EuroBERT architecture - Unintended autoformatting changes to Python files Saving before reverting formatting-only changes. * feat: add back eos assert when not last token pooling * feat: removed duplicated code and cleanup * feat: removed not working architectures and unnecessary check * fix: typo * fix: dynamic pooling config * feat: added an example model for eurobert * feat: proper llama-vocab implementation for jina-v5 * fix: removed unnecessary comments
2026-03-17 16:44:07 +00:00 · 2026-02-26 12:14:09 +01:00
parent 1ca3d1de15
commit 66287bdaac
12 changed files with 214 additions and 4 deletions
--- a/tests/test-tokenizer-0.sh
+++ b/tests/test-tokenizer-0.sh
@@ -13,7 +13,12 @@ fi
 name=$1
 input=$2

-make -j tests/test-tokenizer-0
+# Build using CMake if binary doesn't exist
+if [ ! -f ./build/bin/test-tokenizer-0 ]; then
+    printf "Building test-tokenizer-0 with CMake...\n"
+    cmake -B build -DLLAMA_BUILD_TESTS=ON
+    cmake --build build --target test-tokenizer-0 -j
+fi

 printf "Testing %s on %s ...\n" $name $input

@@ -23,7 +28,7 @@ printf "Tokenizing using (py)  Python AutoTokenizer ...\n"
 python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1

 printf "Tokenizing using (cpp) llama.cpp ...\n"
-./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
+./build/bin/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1

 cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
 cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"