mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-01 22:54:05 +00:00
convert : fix encoding of WPM vocab for BERT models (#18500)
* convert: avoid token collision when stripping ## prefix * convert: use token types for BERT special tokens check * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
@@ -5287,13 +5287,14 @@ class BertModel(TextModel):
|
||||
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
||||
|
||||
# convert to phantom space vocab
|
||||
def phantom(tok):
|
||||
if tok.startswith("[") and tok.endswith("]"):
|
||||
def phantom(tok, toktype):
|
||||
if toktype == gguf.TokenType.CONTROL:
|
||||
return tok
|
||||
if tok.startswith("##"):
|
||||
return tok[2:]
|
||||
return "\u2581" + tok
|
||||
tokens = list(map(phantom, tokens))
|
||||
assert len(tokens) == len(toktypes)
|
||||
tokens = list(map(phantom, tokens, toktypes))
|
||||
|
||||
# add vocab to gguf
|
||||
self.gguf_writer.add_tokenizer_model("bert")
|
||||
|
||||
Reference in New Issue
Block a user