diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index a26f45a5fb1d09..6be75a4269602c 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -257,6 +257,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: "3333333", "33333333", "333333333", + #"Cửa Việt", # llama-bpe fails on this chktxt, ] diff --git a/unicode.cpp b/unicode.cpp index 1e3e5466f4074c..ca03c49d39c7cb 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -112,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) static std::unordered_map unicode_cpt_type_map() { std::unordered_map cpt_types; for (auto p : unicode_ranges_number) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_NUMBER; } } for (auto p : unicode_ranges_letter) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_LETTER; } } for (auto p : unicode_ranges_separator) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_SEPARATOR; } } for (auto p : unicode_ranges_accent_mark) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK; } } for (auto p : unicode_ranges_punctuation) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION; } } @@ -142,7 +142,7 @@ static std::unordered_map unicode_cpt_type_map() { } } for (auto p : unicode_ranges_control) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_CONTROL; } } @@ -629,7 +629,7 @@ bool unicode_cpt_is_whitespace(uint32_t cp) { static const std::unordered_set is_whitespace = [] { std::unordered_set is_whitespace; for (auto p : unicode_ranges_whitespace) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { is_whitespace.insert(i); } }