convert : fix encoding of WPM vocab for BERT models (#18500)

* convert: avoid token collision when stripping ## prefix
* convert: use token types for BERT special tokens check
* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-05-01 22:54:05 +00:00 · 2026-01-02 01:27:07 +08:00
parent f4f5019254
commit 2b2afade9f
1 changed files with 4 additions and 3 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5287,13 +5287,14 @@ class BertModel(TextModel):
        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))

        # convert to phantom space vocab
-        def phantom(tok):
-            if tok.startswith("[") and tok.endswith("]"):
+        def phantom(tok, toktype):
+            if toktype == gguf.TokenType.CONTROL:
                return tok
            if tok.startswith("##"):
                return tok[2:]
            return "\u2581" + tok
-        tokens = list(map(phantom, tokens))
+        assert len(tokens) == len(toktypes)
+        tokens = list(map(phantom, tokens, toktypes))

        # add vocab to gguf
        self.gguf_writer.add_tokenizer_model("bert")