Merge branch 'master' into ik/restore_k-quants_for_MoE
ggerganov authored Jan 11, 2024
2 parents 6e60a5c + 49662cb commit 31fb4d8
Showing 19 changed files with 1,153 additions and 103 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/nix-flake-update.yml
@@ -19,4 +19,4 @@ jobs:
pr-labels: |
nix
pr-reviewers: philiptaron,SomeoneSerge
token: ${{ secrets.GITHUB_TOKEN }}
token: ${{ secrets.FLAKE_TOKEN }}
2 changes: 1 addition & 1 deletion Package.swift
@@ -14,7 +14,7 @@ let package = Package(
.library(name: "llama", targets: ["llama"]),
],
dependencies: [
.package(url: "https://github.com/ggerganov/ggml.git", .branch("master"))
.package(url: "https://github.com/ggerganov/ggml.git", .revision("979cc23b345006504cfc1f67c0fdf627805e3319"))
],
targets: [
.target(
8 changes: 8 additions & 0 deletions common/common.cpp
@@ -630,6 +630,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.ppl_stride = std::stoi(argv[i]);
} else if (arg == "-stc" || arg == "--show_token_count") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.token_interval = std::stoi(argv[i]);
} else if (arg == "--ppl-output-type") {
if (++i >= argc) {
invalid_param = true;
@@ -944,6 +950,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -stc N --show_token_count N\n");
printf(" show consumed tokens every N tokens\n");
printf("\n");
#ifndef LOG_DISABLE_LOGS
log_print_usage();
2 changes: 1 addition & 1 deletion common/common.h
@@ -64,6 +64,7 @@ struct gpt_params {
int32_t n_beams = 0; // if non-zero then use beam search of given width.
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
int32_t token_interval = 512; // show token count every 512 tokens
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
@@ -242,4 +243,3 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

6 changes: 5 additions & 1 deletion examples/main/main.cpp
@@ -500,7 +500,7 @@ int main(int argc, char ** argv) {
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (!embd.empty()) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;

@@ -650,6 +650,10 @@ int main(int argc, char ** argv) {
n_past += n_eval;

LOG("n_past = %d\n", n_past);
// Display total tokens alongside total time
if (n_past % params.token_interval == 0) {
printf("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
}
}

if (!embd.empty() && !path_session.empty()) {
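The three hunks above add a token-count display: common.cpp parses `-stc N` / `--show_token_count N` into `params.token_interval`, common.h defaults the interval to 512, and main.cpp prints the running total whenever `n_past` reaches a multiple of that interval. A hypothetical invocation of the `main` example, assuming a locally built binary and a placeholder model path:

```sh
# Sketch only: binary, model path, and prompt are placeholders.
# With -stc 256 the main example prints
#   "Tokens consumed so far = <n_past> / <n_ctx>"
# each time the number of consumed context tokens hits a multiple of 256.
./main -m models/llama-2-7b.Q4_K_M.gguf \
       -c 4096 \
       -n 1024 \
       -p "Write a short story about a lighthouse keeper." \
       -stc 256
```

Leaving the flag out keeps the default interval of 512 from common.h.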
7 changes: 6 additions & 1 deletion examples/server/README.md
@@ -23,7 +23,8 @@ Command line options:
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
- `--port`: Set the port to listen. Default: `8080`.
- `--path`: path from which to serve static files (default examples/server/public)
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token.
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`.
- `--embedding`: Enable embedding extraction, Default: disabled.
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
@@ -110,6 +111,10 @@ node index.js
```

## API Endpoints
- **GET** `/health`: Returns the current state of the server:
- `{"status": "loading model"}` if the model is still being loaded.
- `{"status": "error"}` if the model failed to load.
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.

- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.

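The README additions above document the new `/health` endpoint and the extended API-key handling. As a rough sketch of client usage (the host and port are the documented defaults; the key value is a placeholder):

```sh
# Readiness probe: 503 + {"status": "loading model"} while the model loads,
# 200 + {"status": "ok"} once ready, 500 + {"status": "error", ...} if loading failed.
curl -i http://127.0.0.1:8080/health

# When --api-key and/or --api-key-file is set, every request must present one of
# the configured keys as a Bearer token. "sk-example-key" is a made-up value.
curl http://127.0.0.1:8080/completion \
     -H "Authorization: Bearer sk-example-key" \
     -H "Content-Type: application/json" \
     -d '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'
```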
101 changes: 70 additions & 31 deletions examples/server/server.cpp
@@ -39,7 +39,7 @@ using json = nlohmann::json;
struct server_params
{
std::string hostname = "127.0.0.1";
std::string api_key;
std::vector<std::string> api_keys;
std::string public_path = "examples/server/public";
int32_t port = 8080;
int32_t read_timeout = 600;
@@ -147,15 +147,15 @@ static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
// parallel
//

enum ServerState {
LOADING_MODEL, // Server is starting up, model not fully loaded yet
READY, // Server is ready and model is loaded
ERROR // An error occurred, load_model failed
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};

enum task_type {
COMPLETION_TASK,
CANCEL_TASK
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
};

struct task_server {
@@ -1402,7 +1402,7 @@ struct llama_server_context
task.data = std::move(data);
task.infill_mode = infill;
task.embedding_mode = embedding;
task.type = COMPLETION_TASK;
task.type = TASK_TYPE_COMPLETION;
task.multitask_id = multitask_id;

// when a completion task's prompt array is not a singleton, we split it into multiple requests
@@ -1524,7 +1524,7 @@ struct llama_server_context
std::unique_lock<std::mutex> lock(mutex_tasks);
task_server task;
task.id = id_gen++;
task.type = CANCEL_TASK;
task.type = TASK_TYPE_CANCEL;
task.target_id = task_id;
queue_tasks.push_back(task);
condition_tasks.notify_one();
@@ -1560,7 +1560,7 @@ struct llama_server_context
queue_tasks.erase(queue_tasks.begin());
switch (task.type)
{
case COMPLETION_TASK: {
case TASK_TYPE_COMPLETION: {
llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
if (slot == nullptr)
{
@@ -1589,7 +1589,7 @@ struct llama_server_context
break;
}
} break;
case CANCEL_TASK: { // release slot linked with the task id
case TASK_TYPE_CANCEL: { // release slot linked with the task id
for (auto & slot : slots)
{
if (slot.task_id == task.target_id)
@@ -2021,6 +2021,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
@@ -2081,7 +2082,28 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
sparams.api_key = argv[i];
sparams.api_keys.push_back(argv[i]);
}
else if (arg == "--api-key-file")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
std::ifstream key_file(argv[i]);
if (!key_file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::string key;
while (std::getline(key_file, key)) {
if (key.size() > 0) {
sparams.api_keys.push_back(key);
}
}
key_file.close();
}
else if (arg == "--timeout" || arg == "-to")
{
@@ -2515,7 +2537,7 @@ json oaicompat_completion_params_parse(
//
// https://platform.openai.com/docs/api-reference/chat/create
llama_sampling_params default_sparams;
llama_params["model"] = json_value(body, "model", std::string("uknown"));
llama_params["model"] = json_value(body, "model", std::string("unknown"));
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
@@ -2798,24 +2820,30 @@ int main(int argc, char **argv)

httplib::Server svr;

std::atomic<ServerState> server_state{LOADING_MODEL};
std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};

svr.set_default_headers({{"Server", "llama.cpp"},
{"Access-Control-Allow-Origin", "*"},
{"Access-Control-Allow-Headers", "content-type"}});
svr.set_default_headers({{"Server", "llama.cpp"}});

// CORS preflight
svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
res.set_header("Access-Control-Allow-Credentials", "true");
res.set_header("Access-Control-Allow-Methods", "POST");
res.set_header("Access-Control-Allow-Headers", "*");
});

svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
ServerState current_state = server_state.load();
server_state current_state = state.load();
switch(current_state) {
case READY:
case SERVER_STATE_READY:
res.set_content(R"({"status": "ok"})", "application/json");
res.status = 200; // HTTP OK
break;
case LOADING_MODEL:
case SERVER_STATE_LOADING_MODEL:
res.set_content(R"({"status": "loading model"})", "application/json");
res.status = 503; // HTTP Service Unavailable
break;
case ERROR:
case SERVER_STATE_ERROR:
res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json");
res.status = 500; // HTTP Internal Server Error
break;
@@ -2881,8 +2909,10 @@ int main(int argc, char **argv)
log_data["hostname"] = sparams.hostname;
log_data["port"] = std::to_string(sparams.port);

if (!sparams.api_key.empty()) {
log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4);
if (sparams.api_keys.size() == 1) {
log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4);
} else if (sparams.api_keys.size() > 1) {
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
}

LOG_INFO("HTTP server listening", log_data);
@@ -2891,7 +2921,7 @@
{
if (!svr.listen_after_bind())
{
server_state.store(ERROR);
state.store(SERVER_STATE_ERROR);
return 1;
}

@@ -2901,17 +2931,18 @@
// load the model
if (!llama.load_model(params))
{
server_state.store(ERROR);
state.store(SERVER_STATE_ERROR);
return 1;
} else {
llama.initialize();
server_state.store(READY);
state.store(SERVER_STATE_READY);
LOG_INFO("model loaded", {});
}

// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation
if (sparams.api_key.empty()) {
if (sparams.api_keys.empty()) {
return true;
}

@@ -2920,7 +2951,7 @@
std::string prefix = "Bearer ";
if (auth_header.substr(0, prefix.size()) == prefix) {
std::string received_api_key = auth_header.substr(prefix.size());
if (received_api_key == sparams.api_key) {
if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
return true; // API key is valid
}
}
@@ -2962,9 +2993,9 @@ int main(int argc, char **argv)
return false;
});

svr.Get("/props", [&llama](const httplib::Request & /*req*/, httplib::Response &res)
svr.Get("/props", [&llama](const httplib::Request & req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", "*");
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = {
{ "user_name", llama.name_user.c_str() },
{ "assistant_name", llama.name_assistant.c_str() }
@@ -2974,6 +3005,7 @@

svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!validate_api_key(req, res)) {
return;
}
@@ -3041,8 +3073,9 @@
}
});

svr.Get("/v1/models", [&params](const httplib::Request&, httplib::Response& res)
svr.Get("/v1/models", [&params](const httplib::Request& req, httplib::Response& res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
std::time_t t = std::time(0);

json models = {
@@ -3060,9 +3093,11 @@
res.set_content(models.dump(), "application/json; charset=utf-8");
});


// TODO: add mount point without "/v1" prefix -- how?
svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!validate_api_key(req, res)) {
return;
}
@@ -3136,6 +3171,7 @@

svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!validate_api_key(req, res)) {
return;
}
@@ -3208,6 +3244,7 @@

svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
std::vector<llama_token> tokens;
if (body.count("content") != 0)
@@ -3220,6 +3257,7 @@

svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
std::string content;
if (body.count("tokens") != 0)
@@ -3234,6 +3272,7 @@

svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
json prompt;
if (body.count("content") != 0)
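Putting the server.cpp changes together: `--api-key` may now be passed multiple times, and `--api-key-file` reads one key per non-empty line into the same `api_keys` vector; `validate_api_key` then accepts a request if its Bearer token matches any configured key. A minimal launch sketch, assuming a built `server` binary and placeholder model path and key values:

```sh
# keys.txt holds one API key per line; empty lines are skipped by the parser above.
cat > keys.txt <<'EOF'
sk-local-dev-key
sk-ci-pipeline-key
EOF

# Keys from --api-key and --api-key-file are merged into one list; with more than
# one key, the startup log only reports how many keys were loaded, not their values.
./server -m models/llama-2-7b.Q4_K_M.gguf \
         --host 127.0.0.1 --port 8080 \
         --api-key sk-extra-operator-key \
         --api-key-file keys.txt
```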
