From 567cf438d41b1bc2643195b7440e5da3763f3f20 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 30 Apr 2024 11:05:25 +0300 Subject: [PATCH] convert : use utf8 encoding (#7000) * convert : use utf8 encoding * convert : update instructions and warning message --- convert-hf-to-gguf-update.py | 16 ++++++++++------ convert-hf-to-gguf.py | 12 ++++++++---- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 1c559c3f693be0..b019c1e3dc59fd 100644 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -128,7 +128,7 @@ def download_file_with_auth(url, token, save_path): print(f"chkhsh: {chkhsh}") # print the "pre_tokenizer" content from the tokenizer.json - with open(f"models/tokenizers/{name}/tokenizer.json", "r") as f: + with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f: cfg = json.load(f) pre_tokenizer = cfg["pre_tokenizer"] print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) @@ -156,15 +156,19 @@ def download_file_with_auth(url, token, save_path): src_func += "\n" src_func += " res = None\n" src_func += "\n" -src_func += " # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n" -src_func += " # don't do this manually - use the convert-hf-to-gguf-update.py script!\n" +src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n" +src_func += " # or pull the latest version of the model from Huggingface\n" +src_func += " # don't edit the hashes manually!\n" src_func += f"{src_ifs}\n" src_func += " if res is None:\n" src_func += " print(\"\\n\")\n" src_func += " print(\"**************************************************************************************\")\n" src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n" -src_func += " print(\"** This means that it was not added yet or you are using an older version.\")\n" 
-src_func += " print(\"** Check convert-hf-to-gguf-update.py and update it accordingly.\")\n" +src_func += " print(\"** There are 2 possible reasons for this:\")\n" +src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n" +src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n" +src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n" +src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n" src_func += " print(\"**\")\n" src_func += " print(f\"** chkhsh: {chkhsh}\")\n" src_func += " print(\"**************************************************************************************\")\n" @@ -249,7 +253,7 @@ def download_file_with_auth(url, token, save_path): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") - with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f: + with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f: for text in tests: f.write(f"{text}") f.write("\n__ggml_vocab_test__\n") diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 3cb6649c5e7e53..df562d88aab1e1 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -279,8 +279,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = None - # NOTE: if you get an error here, you need to add the model to the if-elif chain below - # don't do this manually - use the convert-hf-to-gguf-update.py script! + # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! 
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B res = "llama-bpe" @@ -310,8 +311,11 @@ def get_vocab_base_pre(self, tokenizer) -> str: print("\n") print("**************************************************************************************") print("** WARNING: The BPE pre-tokenizer was not recognized!") - print("** This means that it was not added yet or you are using an older version.") - print("** Check convert-hf-to-gguf-update.py and update it accordingly.") + print("** There are 2 possible reasons for this:") + print("** - the model has not been added to convert-hf-to-gguf-update.py yet") + print("** - the pre-tokenization config has changed upstream") + print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") print("**") print(f"** chkhsh: {chkhsh}") print("**************************************************************************************")