From 61a66f25ab3cf287e93d2e25a4e1a2b1bedd58a9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 8 Oct 2024 14:24:22 +0300 Subject: [PATCH] llama : improve infill support ggml-ci --- common/arg.cpp | 248 ++++++++++----------- common/common.cpp | 18 +- common/common.h | 19 +- examples/infill/infill.cpp | 14 +- examples/server/README.md | 2 +- examples/server/server.cpp | 159 +++++++------- include/llama.h | 17 +- src/llama-vocab.cpp | 38 +++- src/llama-vocab.h | 35 ++- src/llama.cpp | 437 ++++++++++++++++++++++--------------- 10 files changed, 563 insertions(+), 424 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 7f5c05a347d33..dba0bd14472f8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -119,32 +119,6 @@ std::string llama_arg::to_string() { // utils // -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif - -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) { - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - static void gpt_params_handle_model_default(gpt_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model @@ -199,7 +173,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx continue; } } catch (std::exception & e) { - throw std::invalid_argument(format( + throw std::invalid_argument(string_format( "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); } } @@ -220,7 +194,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx std::replace(arg.begin(), arg.end(), '_', '-'); } if (arg_to_options.find(arg) == arg_to_options.end()) { - throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); + throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } auto opt = *arg_to_options[arg]; if (opt.has_value_from_env()) { @@ -252,7 +226,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx continue; } } catch (std::exception & e) { - throw std::invalid_argument(format( + throw std::invalid_argument(string_format( "error while handling argument \"%s\": %s\n\n" "usage:\n%s\n\nto show complete usage, run with -h", arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); @@ -391,28 +365,28 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, )); add_opt(llama_arg( {"--verbose-prompt"}, - format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), [](gpt_params & params) { params.verbose_prompt = true; } )); add_opt(llama_arg( {"--no-display-prompt"}, - format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), + string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), [](gpt_params & params) { params.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-co", "--color"}, - format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), [](gpt_params & params) { params.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(llama_arg( {"-t", "--threads"}, "N", - format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), [](gpt_params & params, int value) { params.cpuparams.n_threads = value; if (params.cpuparams.n_threads <= 0) { @@ -472,14 +446,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, )); add_opt(llama_arg( {"--cpu-strict"}, "<0|1>", - format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), [](gpt_params & params, const std::string & value) { params.cpuparams.strict_cpu = std::stoul(value); } )); add_opt(llama_arg( {"--prio"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), [](gpt_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); @@ -489,7 +463,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, )); add_opt(llama_arg( {"--poll"}, "<0...100>", - format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), [](gpt_params & params, const std::string & value) { params.cpuparams.poll = std::stoul(value); } @@ -523,7 +497,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, )); add_opt(llama_arg( {"--prio-batch"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), + string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), [](gpt_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); @@ -567,7 +541,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--prio-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), [](gpt_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); @@ -611,7 +585,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--prio-batch-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), [](gpt_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); @@ -628,14 +602,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--draft"}, "N", - format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), [](gpt_params & params, int value) { params.n_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(llama_arg( {"-ps", "--p-split"}, "N", - format("speculative decoding split probability (default: %.1f)", (double)params.p_split), + string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split), [](gpt_params & params, const std::string & value) { params.p_split = std::stof(value); } @@ -656,56 +630,56 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(llama_arg( {"-c", "--ctx-size"}, "N", - format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), [](gpt_params & params, int value) { params.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(llama_arg( {"-n", "--predict", "--n-predict"}, "N", - format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), [](gpt_params & params, int value) { params.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(llama_arg( {"-b", "--batch-size"}, "N", - format("logical maximum batch size (default: %d)", params.n_batch), + string_format("logical maximum batch size (default: %d)", params.n_batch), [](gpt_params & params, int value) { params.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); add_opt(llama_arg( {"-ub", "--ubatch-size"}, "N", - format("physical maximum batch size (default: %d)", params.n_ubatch), + string_format("physical maximum batch size (default: %d)", params.n_ubatch), [](gpt_params & params, int value) { params.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); add_opt(llama_arg( {"--keep"}, "N", - format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), + string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), [](gpt_params & params, int value) { params.n_keep = value; } )); add_opt(llama_arg( {"--no-context-shift"}, - format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), + string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), [](gpt_params & params) { params.ctx_shift = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(llama_arg( {"--chunks"}, "N", - format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), [](gpt_params & params, int value) { params.n_chunks = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"-fa", "--flash-attn"}, - format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), + string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), [](gpt_params & params) { params.flash_attn = true; } @@ -721,7 +695,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, )); add_opt(llama_arg( {"--no-perf"}, - format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), [](gpt_params & params) { params.no_perf = true; params.sparams.no_perf = true; @@ -733,7 +707,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } // store the external file name in params params.prompt_file = value; @@ -749,7 +723,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } params.in_files.push_back(value); } @@ -760,7 +734,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } // store the external file name in params params.prompt_file = value; @@ -772,7 +746,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, )); add_opt(llama_arg( {"-e", "--escape"}, - format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), + string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), [](gpt_params & params) { params.escape = true; } @@ -786,7 +760,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, )); add_opt(llama_arg( {"-ptc", "--print-token-count"}, "N", - format("print token count every N tokens (default: %d)", params.n_print), + string_format("print token count every N tokens (default: %d)", params.n_print), [](gpt_params & params, int value) { params.n_print = value; } @@ -821,14 +795,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-sp", "--special"}, - format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), + string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), [](gpt_params & params) { params.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"-cnv", "--conversation"}, - format( + string_format( "run in conversation mode:\n" "- does not print special tokens and suffix/prefix\n" "- interactive mode is also enabled\n" @@ -841,14 +815,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-i", "--interactive"}, - format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), [](gpt_params & params) { params.interactive = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-if", "--interactive-first"}, - format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), + string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), [](gpt_params & params) { params.interactive_first = true; } @@ -893,7 +867,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--spm-infill"}, - format( + string_format( "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" ), @@ -903,7 +877,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--samplers"}, "SAMPLERS", - format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), [](gpt_params & params, const std::string & value) { const auto sampler_names = string_split(value, ';'); params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); @@ -911,14 +885,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_sparam()); add_opt(llama_arg( {"-s", "--seed"}, "SEED", - format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), + string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), [](gpt_params & params, const std::string & value) { params.sparams.seed = std::stoul(value); } ).set_sparam()); add_opt(llama_arg( {"--sampling-seq"}, "SEQUENCE", - format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](gpt_params & params, const std::string & value) { params.sparams.samplers = gpt_sampler_types_from_chars(value); } @@ -932,14 +906,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_sparam()); add_opt(llama_arg( {"--penalize-nl"}, - format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), + string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), [](gpt_params & params) { params.sparams.penalize_nl = true; } ).set_sparam()); add_opt(llama_arg( {"--temp"}, "N", - format("temperature (default: %.1f)", (double)params.sparams.temp), + string_format("temperature (default: %.1f)", (double)params.sparams.temp), [](gpt_params & params, const std::string & value) { params.sparams.temp = std::stof(value); params.sparams.temp = std::max(params.sparams.temp, 0.0f); @@ -947,42 +921,42 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_sparam()); add_opt(llama_arg( {"--top-k"}, "N", - format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), [](gpt_params & params, int value) { params.sparams.top_k = value; } ).set_sparam()); add_opt(llama_arg( {"--top-p"}, "N", - format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), [](gpt_params & params, const std::string & value) { params.sparams.top_p = std::stof(value); } ).set_sparam()); add_opt(llama_arg( {"--min-p"}, "N", - format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), [](gpt_params & params, const std::string & value) { params.sparams.min_p = std::stof(value); } ).set_sparam()); add_opt(llama_arg( {"--tfs"}, "N", - format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), [](gpt_params & params, const std::string & value) { params.sparams.tfs_z = std::stof(value); } ).set_sparam()); add_opt(llama_arg( {"--typical"}, "N", - format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), [](gpt_params & params, const std::string & value) { params.sparams.typ_p = std::stof(value); } ).set_sparam()); add_opt(llama_arg( {"--repeat-last-n"}, "N", - format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), [](gpt_params & params, int value) { params.sparams.penalty_last_n = value; params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); @@ -990,42 +964,42 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_sparam()); add_opt(llama_arg( {"--repeat-penalty"}, "N", - format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), [](gpt_params & params, const std::string & value) { params.sparams.penalty_repeat = std::stof(value); } ).set_sparam()); add_opt(llama_arg( {"--presence-penalty"}, "N", - format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), [](gpt_params & params, const std::string & value) { params.sparams.penalty_present = std::stof(value); } ).set_sparam()); add_opt(llama_arg( {"--frequency-penalty"}, "N", - format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), [](gpt_params & params, const std::string & value) { params.sparams.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(llama_arg( {"--dynatemp-range"}, "N", - format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), [](gpt_params & params, const std::string & value) { params.sparams.dynatemp_range = std::stof(value); } ).set_sparam()); add_opt(llama_arg( {"--dynatemp-exp"}, "N", - format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), [](gpt_params & params, const std::string & value) { params.sparams.dynatemp_exponent = std::stof(value); } ).set_sparam()); add_opt(llama_arg( {"--mirostat"}, "N", - format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), [](gpt_params & params, int value) { params.sparams.mirostat = value; @@ -1033,14 +1007,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_sparam()); add_opt(llama_arg( {"--mirostat-lr"}, "N", - format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), [](gpt_params & params, const std::string & value) { params.sparams.mirostat_eta = std::stof(value); } ).set_sparam()); add_opt(llama_arg( {"--mirostat-ent"}, "N", - format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), [](gpt_params & params, const std::string & value) { params.sparams.mirostat_tau = std::stof(value); } @@ -1069,7 +1043,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_sparam()); add_opt(llama_arg( {"--grammar"}, "GRAMMAR", - format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), [](gpt_params & params, const std::string & value) { params.sparams.grammar = value; } @@ -1080,7 +1054,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } std::copy( std::istreambuf_iterator(file), @@ -1150,49 +1124,49 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); add_opt(llama_arg( {"--yarn-orig-ctx"}, "N", - format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), + string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), [](gpt_params & params, int value) { params.yarn_orig_ctx = value; } ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); add_opt(llama_arg( {"--yarn-ext-factor"}, "N", - format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), [](gpt_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); add_opt(llama_arg( {"--yarn-attn-factor"}, "N", - format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), [](gpt_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); add_opt(llama_arg( {"--yarn-beta-slow"}, "N", - format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), [](gpt_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); add_opt(llama_arg( {"--yarn-beta-fast"}, "N", - format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), [](gpt_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_FAST")); add_opt(llama_arg( {"-gan", "--grp-attn-n"}, "N", - format("group-attention factor (default: %d)", params.grp_attn_n), + string_format("group-attention factor (default: %d)", params.grp_attn_n), [](gpt_params & params, int value) { params.grp_attn_n = value; } ).set_env("LLAMA_ARG_GRP_ATTN_N")); add_opt(llama_arg( {"-gaw", "--grp-attn-w"}, "N", - format("group-attention width (default: %.1f)", (double)params.grp_attn_w), + string_format("group-attention width (default: %.1f)", (double)params.grp_attn_w), [](gpt_params & params, int value) { params.grp_attn_w = value; } @@ -1213,7 +1187,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); add_opt(llama_arg( {"-ctk", "--cache-type-k"}, "TYPE", - format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), [](gpt_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_k = value; @@ -1221,7 +1195,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_env("LLAMA_ARG_CACHE_TYPE_K")); add_opt(llama_arg( {"-ctv", "--cache-type-v"}, "TYPE", - format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), [](gpt_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_v = value; @@ -1229,7 +1203,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_env("LLAMA_ARG_CACHE_TYPE_V")); add_opt(llama_arg( {"--perplexity", "--all-logits"}, - format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), [](gpt_params & params) { params.logits_all = true; } @@ -1243,7 +1217,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--hellaswag-tasks"}, "N", - format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), [](gpt_params & params, int value) { params.hellaswag_tasks = value; } @@ -1257,7 +1231,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--winogrande-tasks"}, "N", - format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), [](gpt_params & params, int value) { params.winogrande_tasks = value; } @@ -1271,7 +1245,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--multiple-choice-tasks"}, "N", - format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), [](gpt_params & params, int value) { params.multiple_choice_tasks = value; } @@ -1292,42 +1266,42 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--ppl-stride"}, "N", - format("stride for perplexity calculation (default: %d)", params.ppl_stride), + string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), [](gpt_params & params, int value) { params.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--ppl-output-type"}, "<0|1>", - format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), [](gpt_params & params, int value) { params.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"-dt", "--defrag-thold"}, "N", - format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), [](gpt_params & params, const std::string & value) { params.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(llama_arg( {"-np", "--parallel"}, "N", - format("number of parallel sequences to decode (default: %d)", params.n_parallel), + string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), [](gpt_params & params, int value) { params.n_parallel = value; } ).set_env("LLAMA_ARG_N_PARALLEL")); add_opt(llama_arg( {"-ns", "--sequences"}, "N", - format("number of sequences to decode (default: %d)", params.n_sequences), + string_format("number of sequences to decode (default: %d)", params.n_sequences), [](gpt_params & params, int value) { params.n_sequences = value; } ).set_examples({LLAMA_EXAMPLE_PARALLEL})); add_opt(llama_arg( {"-cb", "--cont-batching"}, - format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), [](gpt_params & params) { params.cont_batching = true; } @@ -1451,7 +1425,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, std::vector split_arg{ it, {} }; if (split_arg.size() >= llama_max_devices()) { throw std::invalid_argument( - format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) ); } for (size_t i = 0; i < llama_max_devices(); ++i) { @@ -1468,7 +1442,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_env("LLAMA_ARG_TENSOR_SPLIT")); add_opt(llama_arg( {"-mg", "--main-gpu"}, "INDEX", - format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), [](gpt_params & params, int value) { params.main_gpu = value; if (!llama_supports_gpu_offload()) { @@ -1478,7 +1452,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_env("LLAMA_ARG_MAIN_GPU")); add_opt(llama_arg( {"--check-tensors"}, - format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), [](gpt_params & params) { params.check_tensors = true; } @@ -1489,7 +1463,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", [](gpt_params & params, const std::string & value) { if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { - throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); } } )); @@ -1543,7 +1517,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA ? std::string("model path from which to load base model") - : format( + : string_format( "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), @@ -1592,42 +1566,42 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } params.context_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--chunk-size"}, "N", - format("minimum length of embedded text chunks (default: %d)", params.chunk_size), + string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size), [](gpt_params & params, int value) { params.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--chunk-separator"}, "STRING", - format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), [](gpt_params & params, const std::string & value) { params.chunk_separator = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--junk"}, "N", - format("number of times to repeat the junk text (default: %d)", params.n_junk), + string_format("number of times to repeat the junk text (default: %d)", params.n_junk), [](gpt_params & params, int value) { params.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(llama_arg( {"--pos"}, "N", - format("position of the passkey in the junk text (default: %d)", params.i_pos), + string_format("position of the passkey in the junk text (default: %d)", params.i_pos), [](gpt_params & params, int value) { params.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(llama_arg( {"-o", "--output", "--output-file"}, "FNAME", - format("output file (default: '%s')", + string_format("output file (default: '%s')", ex == LLAMA_EXAMPLE_EXPORT_LORA ? params.lora_outfile.c_str() : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR @@ -1641,42 +1615,42 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"-ofreq", "--output-frequency"}, "N", - format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), [](gpt_params & params, int value) { params.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--save-frequency"}, "N", - format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), [](gpt_params & params, int value) { params.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--process-output"}, - format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), [](gpt_params & params) { params.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--no-ppl"}, - format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), [](gpt_params & params) { params.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--chunk", "--from-chunk"}, "N", - format("start processing the input from chunk N (default: %d)", params.i_chunk), + string_format("start processing the input from chunk N (default: %d)", params.i_chunk), [](gpt_params & params, int value) { params.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"-pps"}, - format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), + string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), [](gpt_params & params) { params.is_pp_shared = true; } @@ -1707,7 +1681,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(llama_arg( {"--embd-normalize"}, "N", - format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), [](gpt_params & params, int value) { params.embd_normalize = value; } @@ -1728,35 +1702,35 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--host"}, "HOST", - format("ip address to listen (default: %s)", params.hostname.c_str()), + string_format("ip address to listen (default: %s)", params.hostname.c_str()), [](gpt_params & params, const std::string & value) { params.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(llama_arg( {"--port"}, "PORT", - format("port to listen (default: %d)", params.port), + string_format("port to listen (default: %d)", params.port), [](gpt_params & params, int value) { params.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(llama_arg( {"--path"}, "PATH", - format("path to serve static files from (default: %s)", params.public_path.c_str()), + string_format("path to serve static files from (default: %s)", params.public_path.c_str()), [](gpt_params & params, const std::string & value) { params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); add_opt(llama_arg( {"--embedding", "--embeddings"}, - format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), [](gpt_params & params) { params.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(llama_arg( {"--reranking", "--rerank"}, - format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), + string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), [](gpt_params & params) { params.reranking = true; } @@ -1774,7 +1748,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params, const std::string & value) { std::ifstream key_file(value); if (!key_file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } std::string key; while (std::getline(key_file, key)) { @@ -1801,7 +1775,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); add_opt(llama_arg( {"-to", "--timeout"}, "N", - format("server read/write timeout in seconds (default: %d)", params.timeout_read), + string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), [](gpt_params & params, int value) { params.timeout_read = value; params.timeout_write = value; @@ -1809,7 +1783,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); add_opt(llama_arg( {"--threads-http"}, "N", - format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), [](gpt_params & params, int value) { params.n_threads_http = value; } @@ -1820,7 +1794,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } std::string system_prompt; std::copy( @@ -1833,21 +1807,21 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--metrics"}, - format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), [](gpt_params & params) { params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(llama_arg( {"--slots"}, - format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), [](gpt_params & params) { params.endpoint_slots = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); add_opt(llama_arg( {"--props"}, - format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), + string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), [](gpt_params & params) { params.endpoint_props = true; } @@ -1877,7 +1851,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", [](gpt_params & params, const std::string & value) { if (!llama_chat_verify_template(value)) { - throw std::runtime_error(format( + throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s\n" "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", value.c_str() @@ -1888,14 +1862,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(llama_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", - format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), [](gpt_params & params, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--lora-init-without-apply"}, - format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), + string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), [](gpt_params & params) { params.lora_init_without_apply = true; } @@ -1920,28 +1894,28 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, )); add_opt(llama_arg( {"--positive-file"}, "FNAME", - format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), + string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), [](gpt_params & params, const std::string & value) { params.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--negative-file"}, "FNAME", - format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), [](gpt_params & params, const std::string & value) { params.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--pca-batch"}, "N", - format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), [](gpt_params & params, int value) { params.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--pca-iter"}, "N", - format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), [](gpt_params & params, int value) { params.n_pca_iterations = value; } diff --git a/common/common.cpp b/common/common.cpp index 29df16c951ad0..1b4c5305fd90d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -23,10 +24,10 @@ #include #include #include +#include #include #include #include -#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -400,6 +401,21 @@ std::string gpt_params_get_system_info(const gpt_params & params) { // String utils // +std::string string_format(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + std::vector string_split(std::string input, char separator) { std::vector parts; size_t separator_pos = input.find(separator); diff --git a/common/common.h b/common/common.h index 65add1f305546..6dd1f9d27559b 100644 --- a/common/common.h +++ b/common/common.h @@ -352,15 +352,28 @@ void gpt_init(); std::string gpt_params_get_system_info(const gpt_params & params); -bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); -bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); -void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); +bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]); +bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]); +void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr); bool set_process_priority(enum ggml_sched_priority prio); // // String utils // +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +std::string string_format(const char * fmt, ...); + std::vector string_split(std::string input, char separator); std::string string_strip(const std::string & str); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index d52425ae61ef3..c1756b8bbd795 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -205,11 +205,11 @@ int main(int argc, char ** argv) { std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); - GGML_ASSERT(llama_token_prefix(model) >= 0); - GGML_ASSERT(llama_token_suffix(model) >= 0); + GGML_ASSERT(llama_token_fim_pre(model) >= 0); + GGML_ASSERT(llama_token_fim_suf(model) >= 0); - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); + inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); + inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; @@ -218,7 +218,7 @@ int main(int argc, char ** argv) { } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - const llama_token middle_token = llama_token_middle(model); + const llama_token middle_token = llama_token_fim_mid(model); if (middle_token >= 0) { embd_inp.push_back(middle_token); } @@ -508,8 +508,8 @@ int main(int argc, char ** argv) { std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); + inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); + inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; diff --git a/examples/server/README.md b/examples/server/README.md index 09d1cf0974988..3da0130aca24d 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -526,7 +526,7 @@ Takes a prefix and a suffix and returns the predicted completion as stream. - `input_prefix`: Set the prefix of the code to infill. - `input_suffix`: Set the suffix of the code to infill. -It also accepts all the options of `/completion` except `stream` and `prompt`. +It also accepts all the options of `/completion`. ### **GET** `/props`: Get server global properties. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index aedfca0d6ea1c..b3773d256ef43 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -753,12 +753,7 @@ struct server_context { metrics.init(); } - std::vector tokenize(const json & json_prompt, bool add_special) const { - // TODO: currently, we tokenize using special tokens by default - // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) - // but it's better compared to completely ignoring ChatML and other chat templates - const bool TMP_FORCE_SPECIAL = true; - + std::vector tokenize(const json & json_prompt, bool add_special, bool parse_special) const { // If `add_bos` is true, we only add BOS, when json_prompt is a string, // or the first element of the json_prompt array is a string. std::vector prompt_tokens; @@ -771,10 +766,10 @@ struct server_context { std::vector p; if (first) { - p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); + p = ::llama_tokenize(ctx, s, add_special, parse_special); first = false; } else { - p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); + p = ::llama_tokenize(ctx, s, false, parse_special); } prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); @@ -788,7 +783,7 @@ struct server_context { } } else { auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); + prompt_tokens = ::llama_tokenize(ctx, s, add_special, parse_special); } return prompt_tokens; @@ -1215,7 +1210,7 @@ struct server_context { slot.params.n_predict, n_ctx_train); } - SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str()); + SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str()); return slot.has_next_token; // continue } @@ -1483,9 +1478,8 @@ struct server_context { if (prompt.is_string() || json_is_array_of_numbers(prompt)) { data["index"] = 0; create_task(data, false, nullptr); - } - // otherwise, it's a multiple-prompt task, we break it into smaller tasks - else if (prompt.is_array()) { + } else if (prompt.is_array()) { + // otherwise, it's a multiple-prompt task, we break it into smaller tasks std::vector prompts = prompt; if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { // prompts[0] is the question @@ -1510,9 +1504,8 @@ struct server_context { } } } - } - // invalid case - else { + } else { + // invalid case throw std::runtime_error(error_msg); } @@ -1971,63 +1964,57 @@ struct server_context { slot.t_start_process_prompt = ggml_time_us(); slot.t_start_generation = 0; - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) { - const bool add_bos = llama_add_bos_token(model); - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; - } - - auto prefix_tokens = tokenize(slot.params.input_prefix, false); - auto suffix_tokens = tokenize(slot.params.input_suffix, false); - - const int space_token = 29871; // TODO: this should not be hardcoded - if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } - - prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); - suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); - - auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; - auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; - if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); - } - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - - const llama_token middle_token = llama_token_middle(model); - if (middle_token >= 0) { - embd_inp.push_back(middle_token); - } - - prompt_tokens = embd_inp; - } else if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { - // require slot.prompt to be array of 2 strings - if (!slot.prompt.is_array() || slot.prompt.size() != 2) { - SLT_ERR(slot, "%s", "invalid prompt for rerank task\n"); - slot.release(); - send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST); - continue; - } - - // prompt: [BOS]query[EOS][SEP]doc[EOS] - prompt_tokens.clear(); - prompt_tokens.push_back(llama_token_bos(model)); - { - const auto part = tokenize(slot.prompt[0], false); - prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); - } - prompt_tokens.push_back(llama_token_eos(model)); - prompt_tokens.push_back(llama_token_sep(model)); - { - const auto part = tokenize(slot.prompt[1], false); - prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); - } - prompt_tokens.push_back(llama_token_eos(model)); - } else { - prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt + switch (slot.cmpl_type) { + case SERVER_TASK_CMPL_TYPE_NORMAL: + case SERVER_TASK_CMPL_TYPE_EMBEDDING: + { + prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt + } break; + case SERVER_TASK_CMPL_TYPE_RERANK: + { + // require slot.prompt to be array of 2 strings + if (!slot.prompt.is_array() || slot.prompt.size() != 2) { + SLT_ERR(slot, "%s", "invalid prompt for rerank task\n"); + slot.release(); + send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST); + continue; + } + + // prompt: [BOS]query[EOS][SEP]doc[EOS] + prompt_tokens.clear(); + prompt_tokens.push_back(llama_token_bos(model)); + { + const auto part = tokenize(slot.prompt[0], false, false); + prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); + } + prompt_tokens.push_back(llama_token_eos(model)); + prompt_tokens.push_back(llama_token_sep(model)); + { + const auto part = tokenize(slot.prompt[1], false, false); + prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); + } + prompt_tokens.push_back(llama_token_eos(model)); + } break; + case SERVER_TASK_CMPL_TYPE_INFILL: + { + auto prefix_tokens = tokenize(slot.params.input_prefix, false, false); + auto suffix_tokens = tokenize(slot.params.input_suffix, false, false); + + prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model)); + suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model)); + + auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; + auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; + + if (llama_add_bos_token(model)) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + embd_inp.push_back(llama_token_fim_mid(model)); + + prompt_tokens = std::move(embd_inp); + } break; } slot.n_past = 0; @@ -2035,6 +2022,11 @@ struct server_context { SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); + // print prompt tokens: + for (int i = 0; i < (int) prompt_tokens.size(); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], llama_token_to_piece(ctx, prompt_tokens[i]).c_str()); + } + // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); @@ -2924,7 +2916,23 @@ int main(int argc, char ** argv) { return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res); }; - const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + std::string err; + if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "prefix token is missing. "; + } + if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "suffix token is missing. "; + } + if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "middle token is missing. "; + } + + if (!err.empty()) { + res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); + return; + } + json data = json::parse(req.body); return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res); }; @@ -3010,7 +3018,8 @@ int main(int argc, char ** argv) { if (body.count("content") != 0) { const bool add_special = json_value(body, "add_special", false); const bool with_pieces = json_value(body, "with_pieces", false); - std::vector tokens = ctx_server.tokenize(body.at("content"), add_special); + + std::vector tokens = ctx_server.tokenize(body.at("content"), add_special, true); if (with_pieces) { for (const auto& token : tokens) { diff --git a/include/llama.h b/include/llama.h index 7cae1bbe2e5b8..65d50b41977df 100644 --- a/include/llama.h +++ b/include/llama.h @@ -896,6 +896,7 @@ extern "C" { // Special tokens LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence + LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line @@ -904,11 +905,17 @@ extern "C" { LLAMA_API bool llama_add_bos_token(const struct llama_model * model); LLAMA_API bool llama_add_eos_token(const struct llama_model * model); - // Codellama infill tokens - LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix - LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle - LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix - LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle + // infill tokens + DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead"); + DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead"); + DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead"); + + LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model); // // Tokenization diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d2f34ddd6b339..a27394a377231 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1663,6 +1663,14 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) { return vocab.special_eos_id; } +llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { + return vocab.special_eot_id; +} + +llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { + return vocab.special_eom_id; +} + llama_token llama_token_cls_impl(const struct llama_vocab & vocab) { return vocab.special_cls_id; } @@ -1688,23 +1696,39 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) { } llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) { - return vocab.special_prefix_id; + return vocab.special_fim_pre_id; } llama_token llama_token_middle_impl(const struct llama_vocab & vocab) { - return vocab.special_middle_id; + return vocab.special_fim_mid_id; } llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) { - return vocab.special_suffix_id; + return vocab.special_fim_suf_id; } -llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { - return vocab.special_eot_id; +llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_pre_id; } -llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { - return vocab.special_eom_id; +llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_suf_id; +} + +llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_mid_id; +} + +llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_pad_id; +} + +llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_rep_id; +} + +llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_sep_id; } int32_t llama_tokenize_impl( diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 28bad9135bbf0..17e14488a4d52 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -37,20 +37,26 @@ struct llama_vocab { std::map, int> bpe_ranks; // default LLaMA special tokens + // TODO: should we set all of these to LLAMA_TOKEN_NULL? id special_bos_id = 1; id special_eos_id = 2; + id special_eot_id = LLAMA_TOKEN_NULL; + id special_eom_id = LLAMA_TOKEN_NULL; id special_unk_id = 0; id special_sep_id = LLAMA_TOKEN_NULL; id special_pad_id = LLAMA_TOKEN_NULL; id special_cls_id = LLAMA_TOKEN_NULL; id special_mask_id = LLAMA_TOKEN_NULL; - id linefeed_id = 13; - id special_prefix_id = LLAMA_TOKEN_NULL; - id special_suffix_id = LLAMA_TOKEN_NULL; - id special_middle_id = LLAMA_TOKEN_NULL; - id special_eot_id = LLAMA_TOKEN_NULL; // TODO: move above after "eos_id", and here add "file separator" token - id special_eom_id = LLAMA_TOKEN_NULL; + id linefeed_id = 13; + + // fim tokens + id special_fim_pre_id = LLAMA_TOKEN_NULL; + id special_fim_suf_id = LLAMA_TOKEN_NULL; + id special_fim_mid_id = LLAMA_TOKEN_NULL; + id special_fim_pad_id = LLAMA_TOKEN_NULL; + id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo + id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator // set of all tokens that cause "end of generation" std::set special_eog_ids; @@ -104,19 +110,26 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t llama_token llama_token_bos_impl(const struct llama_vocab & vocab); llama_token llama_token_eos_impl(const struct llama_vocab & vocab); +llama_token llama_token_eot_impl(const struct llama_vocab & vocab); +llama_token llama_token_eom_impl(const struct llama_vocab & vocab); llama_token llama_token_cls_impl(const struct llama_vocab & vocab); llama_token llama_token_sep_impl(const struct llama_vocab & vocab); llama_token llama_token_nl_impl (const struct llama_vocab & vocab); llama_token llama_token_pad_impl(const struct llama_vocab & vocab); -bool llama_add_bos_token_impl(const struct llama_vocab & vocab); -bool llama_add_eos_token_impl(const struct llama_vocab & vocab); - llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab); llama_token llama_token_suffix_impl(const struct llama_vocab & vocab); -llama_token llama_token_eot_impl (const struct llama_vocab & vocab); -llama_token llama_token_eom_impl (const struct llama_vocab & vocab); + +llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab); + +bool llama_add_bos_token_impl(const struct llama_vocab & vocab); +bool llama_add_eos_token_impl(const struct llama_vocab & vocab); int32_t llama_tokenize_impl( const struct llama_vocab & vocab, diff --git a/src/llama.cpp b/src/llama.cpp index 01cdf17dcb91b..78f0670f5a36f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -349,6 +349,8 @@ enum llm_kv { LLM_KV_TOKENIZER_MERGES, LLM_KV_TOKENIZER_BOS_ID, LLM_KV_TOKENIZER_EOS_ID, + LLM_KV_TOKENIZER_EOT_ID, + LLM_KV_TOKENIZER_EOM_ID, LLM_KV_TOKENIZER_UNK_ID, LLM_KV_TOKENIZER_SEP_ID, LLM_KV_TOKENIZER_PAD_ID, @@ -361,11 +363,12 @@ enum llm_kv { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, - LLM_KV_TOKENIZER_PREFIX_ID, - LLM_KV_TOKENIZER_SUFFIX_ID, - LLM_KV_TOKENIZER_MIDDLE_ID, - LLM_KV_TOKENIZER_EOT_ID, - LLM_KV_TOKENIZER_EOM_ID, + LLM_KV_TOKENIZER_FIM_PRE_ID, + LLM_KV_TOKENIZER_FIM_SUF_ID, + LLM_KV_TOKENIZER_FIM_MID_ID, + LLM_KV_TOKENIZER_FIM_PAD_ID, + LLM_KV_TOKENIZER_FIM_REP_ID, + LLM_KV_TOKENIZER_FIM_SEP_ID, LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, @@ -426,57 +429,60 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, - { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, - { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, - { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, - { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, - { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, - { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, - - { LLM_KV_SPLIT_NO, "split.no" }, - { LLM_KV_SPLIT_COUNT, "split.count" }, - { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, - - { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, - { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, - { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, - { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, - { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, - - { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, - - { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, - { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, - { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, - { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, - { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, - { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, - { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, - { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, - { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, - { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, - { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, - { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, - { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, - { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, - { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, - { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, - { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, - { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, - { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, - { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, - { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, - { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, - { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, - - { LLM_KV_ADAPTER_TYPE, "adapter.type" }, - { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, + + { LLM_KV_SPLIT_NO, "split.no" }, + { LLM_KV_SPLIT_COUNT, "split.count" }, + { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, + + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, + { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, + { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, + { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, + { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, + + { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, + + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, + { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, + { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, + { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, + { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, + { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, + { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, + { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, + { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, + { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, + { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, + { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, + { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, + { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, + { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, + { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, + { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, + { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, + { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, + { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, + { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, + { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, + { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" }, + { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, + { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, + + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, }; struct LLM_KV { @@ -6193,14 +6199,14 @@ static void llm_load_vocab( vocab.type = LLAMA_VOCAB_TYPE_NONE; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; - vocab.linefeed_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; + vocab.linefeed_id = LLAMA_TOKEN_NULL; // read vocab size from metadata if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) { @@ -6217,16 +6223,16 @@ static void llm_load_vocab( vocab.special_bos_id = 1; vocab.special_eos_id = 2; vocab.special_unk_id = 0; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; } else if (tokenizer_model == "bert") { vocab.type = LLAMA_VOCAB_TYPE_WPM; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; vocab.special_unk_id = 100; vocab.special_sep_id = 102; vocab.special_pad_id = 0; @@ -6262,22 +6268,22 @@ static void llm_load_vocab( // default special tokens vocab.special_bos_id = 11; vocab.special_eos_id = 11; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; } else if (tokenizer_model == "t5") { vocab.type = LLAMA_VOCAB_TYPE_UGM; // default special tokens - vocab.special_bos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; vocab.special_eos_id = 1; vocab.special_unk_id = 2; - vocab.special_sep_id = -1; + vocab.special_sep_id = LLAMA_TOKEN_NULL; vocab.special_pad_id = 0; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); if (precompiled_charsmap_keyidx != -1) { @@ -6300,11 +6306,11 @@ static void llm_load_vocab( vocab.type = LLAMA_VOCAB_TYPE_RWKV; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; } else { throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); } @@ -6388,7 +6394,7 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "chatglm-bpe") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; - vocab.special_bos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; } else if ( tokenizer_pre == "viking") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; @@ -6514,44 +6520,6 @@ static void llm_load_vocab( // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - // For Fill-In-the-Middle (FIM)/infill models which where converted - // prior to support of FIM special tokens in GGUF, the following - // will allow those models to continue to work. The general names - // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and - // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once - // new versions of these models have been published. - std::string gen_name; - ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false); - - std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(), - [](unsigned char c){ return std::tolower(c); }); - - if (gen_name.find("code") != std::string::npos) { - if (model.arch == LLM_ARCH_LLAMA - && 32010 < vocab.id_to_token.size() - && vocab.id_to_token[32007].text.find("
") != std::string::npos
-              && vocab.id_to_token[32008].text.find("") != std::string::npos
-              && vocab.id_to_token[32009].text.find("") != std::string::npos
-              && vocab.id_to_token[32010].text.find("") != std::string::npos) {
-                vocab.special_prefix_id = 32007;
-                vocab.special_suffix_id = 32008;
-                vocab.special_middle_id = 32009;
-                vocab.special_eot_id    = 32010;
-            } else if (model.arch == LLM_ARCH_GEMMA
-              && 107 < vocab.id_to_token.size()
-              && vocab.id_to_token[67].text == "<|fim_prefix|>"
-              && vocab.id_to_token[69].text == "<|fim_suffix|>"
-              && vocab.id_to_token[68].text == "<|fim_middle|>"
-              && vocab.id_to_token[107].text == "") {
-                vocab.special_prefix_id = 67;
-                vocab.special_suffix_id = 69;
-                vocab.special_middle_id = 68;
-                // TODO: this is not EOT, it is "file separator" token, needs fix
-                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-                //vocab.special_eot_id    = 70;
-                vocab.special_eot_id    = 107;
-            }
-        }
         try {
             vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
         } catch (const std::exception & e) {
@@ -6579,18 +6547,21 @@ static void llm_load_vocab(
     // special tokens
     {
         const std::vector> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
-            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
-            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
-            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
-            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
-            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
-            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
-            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
-            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
-            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
-            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
-            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
+            { LLM_KV_TOKENIZER_BOS_ID,     vocab.special_bos_id     },
+            { LLM_KV_TOKENIZER_EOS_ID,     vocab.special_eos_id     },
+            { LLM_KV_TOKENIZER_EOT_ID,     vocab.special_eot_id     },
+            { LLM_KV_TOKENIZER_EOM_ID,     vocab.special_eom_id     },
+            { LLM_KV_TOKENIZER_UNK_ID,     vocab.special_unk_id     },
+            { LLM_KV_TOKENIZER_SEP_ID,     vocab.special_sep_id     },
+            { LLM_KV_TOKENIZER_PAD_ID,     vocab.special_pad_id     },
+            { LLM_KV_TOKENIZER_CLS_ID,     vocab.special_cls_id     },
+            { LLM_KV_TOKENIZER_MASK_ID,    vocab.special_mask_id    },
+            { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
+            { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
+            { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
+            { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
         };
 
         for (const auto & it : special_token_types) {
@@ -6621,16 +6592,14 @@ static void llm_load_vocab(
             }
         }
 
-        // find EOT token: "<|eot_id|>", "<|im_end|>", "", etc.
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
-        //       for now, we apply this workaround to find the EOT token based on its text
-        if (vocab.special_eot_id == -1) {
-            for (const auto & t : vocab.token_to_id) {
+        // auto-detect special tokens by text
+        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+        //       for now, we apply this workaround to find the tokens based on their text
+
+        for (const auto & t : vocab.token_to_id) {
+            // find EOT token: "<|eot_id|>", "<|im_end|>", "", etc.
+            if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
                 if (false
-                        // TODO: gemma "" is exported as a normal token, so the following check does not work
-                        //       need to fix convert script
-                        //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
                         || t.first == "<|eot_id|>"
                         || t.first == "<|im_end|>"
                         || t.first == "<|end|>"
@@ -6644,23 +6613,109 @@ static void llm_load_vocab(
                                 __func__, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
-                    break;
                 }
             }
-        }
 
-        // find EOM token: "<|eom_id|>"
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
-        //       for now, we apply this workaround to find the EOM token based on its text
-        if (vocab.special_eom_id == -1) {
-            const auto & t = vocab.token_to_id.find("<|eom_id|>");
-            if (t != vocab.token_to_id.end()) {
-                vocab.special_eom_id = t->second;
-                if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                        __func__, t->first.c_str());
-                    vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+            // find EOM token: "<|eom_id|>"
+            if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eom_id|>"
+                   ) {
+                    vocab.special_eom_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PRE token: "<|fim_prefix|>", "", "
", etc.
+            if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_prefix|>"
+                        || t.first == ""
+                        || t.first == "
") {
+                    vocab.special_fim_pre_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SUF token: "<|fim_suffix|>", "", "", etc.
+            if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_suffix|>"
+                        || t.first == ""
+                        || t.first == "") {
+                    vocab.special_fim_suf_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_MID token: "<|fim_middle|>", "", "", etc.
+            if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_middle|>"
+                        || t.first == ""
+                        || t.first == "") {
+                    vocab.special_fim_mid_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PAD token: "<|fim_pad|>", "", "", etc.
+            if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_pad|>"
+                        || t.first == ""
+                        || t.first == "") {
+                    vocab.special_fim_pad_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_REP token: "<|fim_repo|>", "", "", etc.
+            if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_repo|>"
+                        || t.first == "<|repo_name|>"
+                        || t.first == ""
+                        || t.first == "") {
+                    vocab.special_fim_rep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SEP token: "<|file_sep|>"
+            if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|file_sep|>") {
+                    vocab.special_fim_sep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                 }
             }
         }
@@ -6688,17 +6743,17 @@ static void llm_load_vocab(
             }
         }
 
-        if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+        if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eos_id);
             LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+        if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eot_id);
             LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+        if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
@@ -6892,20 +6947,24 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id    != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,  vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
-    if (vocab.special_eos_id    != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,  vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
-    if (vocab.special_unk_id    != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,  vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
-    if (vocab.special_sep_id    != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,  vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
-    if (vocab.special_pad_id    != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,  vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
-    if (vocab.special_cls_id    != -1) { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,  vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
-    if (vocab.special_mask_id   != -1) { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
-
-    if (vocab.linefeed_id       != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,       vocab.id_to_token[vocab.linefeed_id].text.c_str() );       }
-    if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token        = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
-    if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token        = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
-    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token        = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
-    if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,    vocab.id_to_token[vocab.special_eot_id].text.c_str() );    }
-    if (vocab.special_eom_id    != -1) { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,    vocab.id_to_token[vocab.special_eom_id].text.c_str() );    }
+    if (vocab.special_bos_id  != -1)    { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,     vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
+    if (vocab.special_eos_id  != -1)    { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,     vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
+    if (vocab.special_eot_id  != -1)    { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,     vocab.id_to_token[vocab.special_eot_id].text.c_str() );  }
+    if (vocab.special_eom_id  != -1)    { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,     vocab.id_to_token[vocab.special_eom_id].text.c_str() );  }
+    if (vocab.special_unk_id  != -1)    { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,     vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
+    if (vocab.special_sep_id  != -1)    { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,     vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
+    if (vocab.special_pad_id  != -1)    { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,     vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
+    if (vocab.special_cls_id  != -1)    { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,     vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
+    if (vocab.special_mask_id != -1)    { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id,    vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+    if (vocab.linefeed_id != -1)        { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,        vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+    if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token    = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+    if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token    = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+    if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token    = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+    if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token    = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+    if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token    = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+    if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token    = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
 
     for (const auto & id : vocab.special_eog_ids) {
         LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
@@ -21318,6 +21377,10 @@ llama_token llama_token_eos(const struct llama_model * model) {
     return llama_token_eos_impl(model->vocab);
 }
 
+llama_token llama_token_eot(const struct llama_model * model) {
+    return llama_token_eot_impl(model->vocab);
+}
+
 llama_token llama_token_cls(const struct llama_model * model) {
     return llama_token_cls_impl(model->vocab);
 }
@@ -21354,8 +21417,28 @@ llama_token llama_token_suffix(const struct llama_model * model) {
     return llama_token_suffix_impl(model->vocab);
 }
 
-llama_token llama_token_eot(const struct llama_model * model) {
-    return llama_token_eot_impl(model->vocab);
+llama_token llama_token_fim_pre(const struct llama_model * model) {
+    return llama_token_fim_pre_impl(model->vocab);
+}
+
+llama_token llama_token_fim_suf(const struct llama_model * model) {
+    return llama_token_fim_suf_impl(model->vocab);
+}
+
+llama_token llama_token_fim_mid(const struct llama_model * model) {
+    return llama_token_fim_mid_impl(model->vocab);
+}
+
+llama_token llama_token_fim_pad(const struct llama_model * model) {
+    return llama_token_fim_pad_impl(model->vocab);
+}
+
+llama_token llama_token_fim_rep(const struct llama_model * model) {
+    return llama_token_fim_rep_impl(model->vocab);
+}
+
+llama_token llama_token_fim_sep(const struct llama_model * model) {
+    return llama_token_fim_sep_impl(model->vocab);
 }
 
 //