From 1c3fdf8cfd7dce2c6fde8f1f35d8c14708d9ef34 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Tue, 23 May 2023 06:16:54 -0700 Subject: [PATCH 001/121] Add all generation parameters to server.cpp and allow resetting context sever.cpp left out a few generation parameters and also seems built to assume un-editable chatting with no regens or swipes. I added a simple "reload_ctx" flag that can be passed on generation that will cause the prompt to be reloaded. --- examples/server/server.cpp | 83 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7209a2b5232f0..644490f9be65a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -13,6 +13,8 @@ struct llama_server_context { bool as_loop = false; bool has_next_token = false; + + std::string generated_text = ""; int32_t num_tokens_predicted = 0; @@ -32,12 +34,33 @@ struct llama_server_context llama_context *ctx; gpt_params params; + bool reload_ctx = false; + void rewind() { as_loop = false; params.antiprompt.clear(); no_show_words.clear(); num_tokens_predicted = 0; generated_text = ""; + + if(reload_ctx) + { + if(processed_tokens.size() != 0) + { + processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end()); + } + + if(embd_inp.size() != 0) + { + embd_inp.erase(embd_inp.begin() + 1, embd_inp.end()); + } + + n_remain = 0; + n_past = 0; + n_consumed = 0; + + reload_ctx = false; + } } bool loadModel(gpt_params params_) @@ -58,6 +81,21 @@ struct llama_server_context bool loadPrompt() { params.prompt.insert(0, 1, ' '); // always add a first space + + if(processed_tokens.size() != 0) + { + processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end()); + } + + if(embd_inp.size() != 0) + { + embd_inp.erase(embd_inp.begin() + 1, embd_inp.end()); + } + + n_remain = 0; + n_past = 0; + n_consumed = 0; + std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); // compare the evaluated prompt with the new prompt int new_prompt_len = 0; @@ -112,6 +150,7 @@ struct llama_server_context // Reset context const int n_left = n_past - params.n_keep; n_past = std::max(1, params.n_keep); + last_n_tokens.erase(last_n_tokens.begin() + n_past, last_n_tokens.end()); processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end()); embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size()); } @@ -518,10 +557,50 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & { llama.params.top_p = body["top_p"].get(); } + if (!body["tfs_z"].is_null()) + { + llama.params.tfs_z = body["tfs_z"].get(); + } + if (!body["typical_p"].is_null()) + { + llama.params.typical_p = body["typical_p"].get(); + } + if (!body["repeat_last_n"].is_null()) + { + llama.params.repeat_last_n = body["repeat_last_n"].get(); + } if (!body["temperature"].is_null()) { llama.params.temp = body["temperature"].get(); } + if (!body["repeat_penalty"].is_null()) + { + llama.params.repeat_penalty = body["repeat_penalty"].get(); + } + if (!body["presence_penalty"].is_null()) + { + llama.params.presence_penalty = body["presence_penalty"].get(); + } + if (!body["frequency_penalty"].is_null()) + { + llama.params.frequency_penalty = body["frequency_penalty"].get(); + } + if (!body["mirostat"].is_null()) + { + llama.params.mirostat = body["mirostat"].get(); + } + if (!body["mirostat_tau"].is_null()) + { + llama.params.mirostat_tau = 
body["mirostat_tau"].get(); + } + if (!body["mirostat_eta"].is_null()) + { + llama.params.mirostat_eta = body["mirostat_eta"].get(); + } + if (!body["penalize_nl"].is_null()) + { + llama.params.penalize_nl = body["penalize_nl"].get(); + } if (!body["batch_size"].is_null()) { llama.params.n_batch = body["batch_size"].get(); @@ -538,6 +617,10 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & { llama.params.interactive = body["interactive"].get(); } + if (!body["reload_ctx"].is_null()) + { + llama.reload_ctx = body["reload_ctx"].get(); + } if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); From 2071d730faa2e5c9e6dc21d32902cc389ab5c4e8 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Tue, 23 May 2023 06:22:30 -0700 Subject: [PATCH 002/121] Forgot to remove some testing code. --- examples/server/server.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 644490f9be65a..15bb9b7298ad8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -13,8 +13,6 @@ struct llama_server_context { bool as_loop = false; bool has_next_token = false; - - std::string generated_text = ""; int32_t num_tokens_predicted = 0; @@ -81,21 +79,6 @@ struct llama_server_context bool loadPrompt() { params.prompt.insert(0, 1, ' '); // always add a first space - - if(processed_tokens.size() != 0) - { - processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end()); - } - - if(embd_inp.size() != 0) - { - embd_inp.erase(embd_inp.begin() + 1, embd_inp.end()); - } - - n_remain = 0; - n_past = 0; - n_consumed = 0; - std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); // compare the evaluated prompt with the new prompt int new_prompt_len = 0; @@ -150,7 +133,6 @@ struct llama_server_context // Reset context const int n_left = n_past - params.n_keep; n_past = std::max(1, params.n_keep); - last_n_tokens.erase(last_n_tokens.begin() + n_past, last_n_tokens.end()); processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end()); embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size()); } From 421e66b330092c31d8322867d6bc7c6924b0ee0b Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Tue, 23 May 2023 07:34:36 -0700 Subject: [PATCH 003/121] Update examples/server/server.cpp Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 15bb9b7298ad8..a5ca57a5d1da6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -43,7 +43,7 @@ struct llama_server_context if(reload_ctx) { - if(processed_tokens.size() != 0) + if(!processed_tokens.empty()) { processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end()); } From add5f1bdc97ef425f319b63bc0c8ccce1b09ab1c Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Tue, 23 May 2023 07:34:41 -0700 Subject: [PATCH 004/121] Update examples/server/server.cpp Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a5ca57a5d1da6..f8586628abdf4 100644 --- a/examples/server/server.cpp +++ 
b/examples/server/server.cpp @@ -48,7 +48,7 @@ struct llama_server_context processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end()); } - if(embd_inp.size() != 0) + if(!embd_inp.empty()) { embd_inp.erase(embd_inp.begin() + 1, embd_inp.end()); } From 8d7b28c28d07c3e9739f574c96843e1e0cff220f Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Tue, 23 May 2023 13:35:12 -0700 Subject: [PATCH 005/121] Fixed some types in the params. Quickly copy pasted without fixing them up. Whoopsies. --- examples/server/server.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f8586628abdf4..6cc07a9556079 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -549,7 +549,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } if (!body["repeat_last_n"].is_null()) { - llama.params.repeat_last_n = body["repeat_last_n"].get(); + llama.params.repeat_last_n = body["repeat_last_n"].get(); } if (!body["temperature"].is_null()) { @@ -569,7 +569,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } if (!body["mirostat"].is_null()) { - llama.params.mirostat = body["mirostat"].get(); + llama.params.mirostat = body["mirostat"].get(); } if (!body["mirostat_tau"].is_null()) { @@ -581,7 +581,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } if (!body["penalize_nl"].is_null()) { - llama.params.penalize_nl = body["penalize_nl"].get(); + llama.params.penalize_nl = body["penalize_nl"].get(); } if (!body["batch_size"].is_null()) { From c2b55cc9171cbefdd0f5cb7c8cfc3d728245e61b Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Thu, 25 May 2023 12:53:05 -0700 Subject: [PATCH 006/121] Added LoRA Loading Someone please test this. I have no LoRAs available to test. The code is direct from the base repo so it should be fine. --- examples/server/server.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6cc07a9556079..d77d6b3031744 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -421,6 +421,8 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms) fprintf(stderr, " number of layers to store in VRAM\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " -host ip address to listen (default 127.0.0.1)\n"); fprintf(stderr, " -port PORT port to listen (default 8080)\n"); fprintf(stderr, "\n"); @@ -505,6 +507,24 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } params.n_gpu_layers = std::stoi(argv[i]); } + else if (arg == "--lora") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_adapter = argv[i]; + params.use_mmap = false; + } + else if (arg == "--lora-base") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; + } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); From 66ed19d01fa71b467cfef0a0928444c0e710aaa9 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Sat, 27 May 2023 11:51:21 -0700 Subject: [PATCH 007/121] Corrected dashes in the help lines. 
Co-authored-by: Henri Vasserman --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 72fb79592e6f9..4dddf50d30099 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -423,8 +423,8 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms) fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stderr, " -host ip address to listen (default 127.0.0.1)\n"); - fprintf(stderr, " -port PORT port to listen (default 8080)\n"); + fprintf(stderr, " --host ip address to listen (default 127.0.0.1)\n"); + fprintf(stderr, " --port PORT port to listen (default 8080)\n"); fprintf(stderr, "\n"); } From 36c86d794dd3981160c9ec786ac6825231435137 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Sat, 27 May 2023 16:43:08 -0700 Subject: [PATCH 008/121] Automate Context resetting and minor fixes Fixed top_k still not being set. Removed an unnecessary loop. --- examples/server/server.cpp | 56 ++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4dddf50d30099..d5a1473f18ba9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -29,36 +29,17 @@ struct llama_server_context std::vector> no_show_words; std::vector tokens_predicted; + std::vector last_prompt_tokens; + llama_context *ctx; gpt_params params; - bool reload_ctx = false; - void rewind() { as_loop = false; params.antiprompt.clear(); no_show_words.clear(); num_tokens_predicted = 0; generated_text = ""; - - if(reload_ctx) - { - if(!processed_tokens.empty()) - { - processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end()); - } - - if(!embd_inp.empty()) - { - embd_inp.erase(embd_inp.begin() + 1, embd_inp.end()); - } - - n_remain = 0; - n_past = 0; - n_consumed = 0; - - reload_ctx = false; - } } bool loadModel(gpt_params params_) @@ -82,6 +63,28 @@ struct llama_server_context std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); // compare the evaluated prompt with the new prompt int new_prompt_len = 0; + if (last_prompt_tokens == prompt_tokens) + { + //fprintf(stdout, "Context matched.\n"); + processed_tokens = last_prompt_tokens; + embd_inp = last_prompt_tokens; + n_past = processed_tokens.size(); + n_consumed = last_prompt_tokens.size() - 2; + last_prompt_tokens = prompt_tokens; + has_next_token = true; + return true; + } + else + { + if (!processed_tokens.empty() && !embd_inp.empty()) + { + //fprintf(stdout, "Resetting context.\n"); + processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end()); + embd_inp.erase(embd_inp.begin() + 1, embd_inp.end()); + n_consumed = 0; + n_past = 0; + } + } for (size_t i = 0; i < prompt_tokens.size(); i++) { if (i < processed_tokens.size() && processed_tokens[i] == prompt_tokens[i]) @@ -159,6 +162,7 @@ struct llama_server_context const float temp = params.temp; // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; const float top_p = params.top_p; + const float top_k = params.top_k; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
params.n_ctx : params.repeat_last_n; @@ -229,6 +233,7 @@ struct llama_server_context llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); llama_sample_typical(ctx, &candidates_p, typical_p, 1); llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_top_k(ctx, &candidates_p, top_k, 1); llama_sample_temperature(ctx, &candidates_p, temp); id = llama_sample_token(ctx, &candidates_p); } @@ -253,10 +258,7 @@ struct llama_server_context // add it to the context embd.push_back(id); - for (auto id : embd) - { - result = id; - } + result = id; // decrement remaining sampling budget --n_remain; } @@ -619,10 +621,6 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & { llama.params.interactive = body["interactive"].get(); } - if (!body["reload_ctx"].is_null()) - { - llama.reload_ctx = body["reload_ctx"].get(); - } if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); From d20f36b93ca6ed3bfc64ec72c734f02e49856951 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Sat, 27 May 2023 16:46:05 -0700 Subject: [PATCH 009/121] Removed unnecessary last_prompt_token set Added the one that was supposed to be there. Apologies for the extra commits, I'm copy pasting from my editor to preserve the two-space indent formatting. --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d5a1473f18ba9..bd1bc07aed6c2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -70,7 +70,6 @@ struct llama_server_context embd_inp = last_prompt_tokens; n_past = processed_tokens.size(); n_consumed = last_prompt_tokens.size() - 2; - last_prompt_tokens = prompt_tokens; has_next_token = true; return true; } @@ -111,6 +110,7 @@ struct llama_server_context { return false; } + last_prompt_tokens = prompt_tokens; has_next_token = true; return true; } From e84b80216142a5d0401bcda69abce5f69dda022e Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Sat, 27 May 2023 17:07:45 -0700 Subject: [PATCH 010/121] Change top_k type. Is my lack of knowledge of the code base showing? Yes it is. --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index bd1bc07aed6c2..8942552a57724 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -162,7 +162,7 @@ struct llama_server_context const float temp = params.temp; // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; const float top_p = params.top_p; - const float top_k = params.top_k; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n; From 1f40a789e61ca052a71748a42c38f0d0ab9f7ead Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Sat, 27 May 2023 17:10:09 -0700 Subject: [PATCH 011/121] Didn't see the already defined top_k var. lol. Embarrassing. Don't edit code in the github web viewer, kids. 
--- examples/server/server.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8942552a57724..356ec1b7bb1e1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -160,9 +160,8 @@ struct llama_server_context { // out of user input, sample next token const float temp = params.temp; - // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n; From 51e09944cef163c5b8bb88d91a25541e5fa606e3 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 28 May 2023 02:42:18 +0300 Subject: [PATCH 012/121] server rewrite Remove unnecessary things and radically rewrite server --- examples/server/server.cpp | 202 +++++++------------------------------ 1 file changed, 35 insertions(+), 167 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 356ec1b7bb1e1..243da3564ed88 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -15,19 +15,16 @@ struct llama_server_context bool has_next_token = false; std::string generated_text = ""; - int32_t num_tokens_predicted = 0; - int32_t n_past = 0; - int32_t n_consumed = 0; - int32_t n_session_consumed = 0; - int32_t n_remain = 0; + size_t num_tokens_predicted = 0; + size_t n_past = 0; + size_t n_consumed = 0; + size_t n_session_consumed = 0; + size_t n_remain = 0; std::vector embd; std::vector last_n_tokens; std::vector processed_tokens; - std::vector llama_token_newline; std::vector embd_inp; - std::vector> no_show_words; - std::vector tokens_predicted; std::vector last_prompt_tokens; @@ -37,9 +34,14 @@ struct llama_server_context void rewind() { as_loop = false; params.antiprompt.clear(); - no_show_words.clear(); num_tokens_predicted = 0; generated_text = ""; + + //processed_tokens.clear(); + embd_inp.clear(); + n_remain = 0; + n_past = 0; + n_consumed = 0; } bool loadModel(gpt_params params_) @@ -51,8 +53,7 @@ struct llama_server_context fprintf(stderr, "%s: error: unable to load model\n", __func__); return false; } - // determine newline token - llama_token_newline = ::llama_tokenize(ctx, "\n", false); + last_n_tokens.resize(params.n_ctx); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); return true; @@ -62,53 +63,14 @@ struct llama_server_context params.prompt.insert(0, 1, ' '); // always add a first space std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); // compare the evaluated prompt with the new prompt - int new_prompt_len = 0; - if (last_prompt_tokens == prompt_tokens) - { - //fprintf(stdout, "Context matched.\n"); - processed_tokens = last_prompt_tokens; - embd_inp = last_prompt_tokens; - n_past = processed_tokens.size(); - n_consumed = last_prompt_tokens.size() - 2; - has_next_token = true; - return true; - } - else - { - if (!processed_tokens.empty() && !embd_inp.empty()) - { - //fprintf(stdout, "Resetting context.\n"); - processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end()); - embd_inp.erase(embd_inp.begin() + 1, embd_inp.end()); - n_consumed = 0; - n_past = 0; - } - } - for (size_t i = 0; i < prompt_tokens.size(); i++) { - if (i < processed_tokens.size() && - processed_tokens[i] == prompt_tokens[i]) - { - 
continue; - } - else - { - embd_inp.push_back(prompt_tokens[i]); - if(new_prompt_len == 0) { - if(int32_t(i) - 1 < n_past) { - processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end()); - } - // Evaluate the new fragment prompt from the last token processed. - n_past = processed_tokens.size(); - } - new_prompt_len ++; + for (n_past = 0; n_past < prompt_tokens.size() - 1 && n_past < processed_tokens.size(); n_past++) { + if (prompt_tokens[n_past] != processed_tokens[n_past]) { + break; } } - if(n_past > 0 && params.interactive) { - n_remain -= new_prompt_len; - } - if ((int)embd_inp.size() > params.n_ctx - 4) - { - return false; + processed_tokens.resize(n_past); + if (prompt_tokens.size() > n_past) { + embd_inp.insert(embd_inp.end(), prompt_tokens.begin() + n_past, prompt_tokens.end()); } last_prompt_tokens = prompt_tokens; has_next_token = true; @@ -131,7 +93,7 @@ struct llama_server_context llama_token result = -1; if (embd.size() > 0) { - if (n_past + (int)embd.size() > params.n_ctx) + if (n_past + embd.size() > (size_t)params.n_ctx) { // Reset context const int n_left = n_past - params.n_keep; @@ -156,7 +118,7 @@ struct llama_server_context } } embd.clear(); - if ((int)embd_inp.size() <= n_consumed && has_next_token) + if (embd_inp.size() <= n_consumed) { // out of user input, sample next token const float temp = params.temp; @@ -243,18 +205,6 @@ struct llama_server_context num_tokens_predicted++; } - // replace end of text token with newline token when in interactive mode - if (id == llama_token_eos() && params.interactive) - { - id = llama_token_newline.front(); - if (params.antiprompt.size() != 0) - { - // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); - embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); - } - } - // add it to the context embd.push_back(id); result = id; @@ -264,7 +214,7 @@ struct llama_server_context else { // some user input remains from prompt or interaction, forward it to processing - while ((int)embd_inp.size() > n_consumed) + while (embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); last_n_tokens.erase(last_n_tokens.begin()); @@ -277,41 +227,11 @@ struct llama_server_context } } } - if (params.interactive && (int)embd_inp.size() <= n_consumed) - { - // check for reverse prompt - if (params.antiprompt.size()) - { - std::string last_output; - for (auto id : last_n_tokens) - { - last_output += llama_token_to_str(ctx, id); - } - has_next_token = true; - // Check if each of the reverse prompts appears at the end of the output. 
- for (std::string &antiprompt : params.antiprompt) - { - if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) - { - has_next_token = false; - return result; - } - } - } - if (n_past > 0) - { - has_next_token = true; - } - } if (!embd.empty() && embd.back() == llama_token_eos()) { has_next_token = false; } - if (params.interactive && n_remain <= 0 && params.n_predict != -1) - { - n_remain = params.n_predict; - } has_next_token = n_remain != 0; return result; } @@ -322,58 +242,23 @@ struct llama_server_context if (token == -1) { return ""; } - tokens_predicted.clear(); - tokens_predicted.push_back(token); - - // Avoid add the no show words to the response - for (std::vector word_tokens : no_show_words) - { - size_t match_token = 1; - if (tokens_predicted.front() == word_tokens.front()) - { - bool execute_matching = true; - if (tokens_predicted.size() > 1) { // if previus tokens had been tested - for (size_t i = 1; i < word_tokens.size(); i++) - { - if (i >= tokens_predicted.size()) { - match_token = i; - break; - } - if (tokens_predicted[i] == word_tokens[i]) - { - continue; - } - else - { - execute_matching = false; - break; - } - } - } - while (execute_matching) { - if (match_token == word_tokens.size()) { - return ""; - } - token = nextToken(); - tokens_predicted.push_back(token); - if (token == word_tokens[match_token]) - { // the token follow the sequence - match_token++; - } - else if (match_token < word_tokens.size()) - { // no complete all word sequence - break; - } - } - } - } + if(as_loop) { generated_text = ""; } - for (llama_token tkn : tokens_predicted) - { - generated_text += llama_token_to_str(ctx, tkn); + + std::string token_text = llama_token_to_str(ctx, token); + generated_text += token_text; + + for (std::string word : params.antiprompt) { + size_t i = generated_text.find(word, generated_text.size() - (word.size() + token_text.size())); + if (i != std::string::npos) { + generated_text.erase(generated_text.begin() + i, generated_text.begin() + i + word.size()); + has_next_token = false; + break; + } } + return generated_text; } @@ -616,10 +501,6 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & { llama.as_loop = body["as_loop"].get(); } - if (!body["interactive"].is_null()) - { - llama.params.interactive = body["interactive"].get(); - } if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); @@ -635,20 +516,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } if (!body["stop"].is_null()) { - std::vector stop_words = body["stop"].get>(); - for (std::string stop_word : stop_words) - { - llama.params.antiprompt.push_back(stop_word); - llama.no_show_words.push_back(::llama_tokenize(llama.ctx, stop_word, false)); - } - } - if (!body["exclude"].is_null()) - { - std::vector no_show_words = body["exclude"].get>(); - for (std::string no_show : no_show_words) - { - llama.no_show_words.push_back(::llama_tokenize(llama.ctx, no_show, false)); - } + llama.params.antiprompt = body["stop"].get>(); } return true; } From f93fe36c5ba9b135d2cdd3bfb85a5becd14955ae Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Tue, 23 May 2023 06:16:54 -0700 Subject: [PATCH 013/121] Add all generation parameters to server.cpp and allow resetting context sever.cpp left out a few generation parameters and also seems built to assume un-editable chatting with no regens or swipes. 
I added a simple "reload_ctx" flag that can be passed on generation that will cause the prompt to be reloaded. --- examples/server/server.cpp | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 243da3564ed88..eccb2f2ad7c2f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -13,6 +13,8 @@ struct llama_server_context { bool as_loop = false; bool has_next_token = false; + + std::string generated_text = ""; size_t num_tokens_predicted = 0; @@ -31,6 +33,8 @@ struct llama_server_context llama_context *ctx; gpt_params params; + bool reload_ctx = false; + void rewind() { as_loop = false; params.antiprompt.clear(); @@ -61,6 +65,21 @@ struct llama_server_context bool loadPrompt() { params.prompt.insert(0, 1, ' '); // always add a first space + + if(processed_tokens.size() != 0) + { + processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end()); + } + + if(embd_inp.size() != 0) + { + embd_inp.erase(embd_inp.begin() + 1, embd_inp.end()); + } + + n_remain = 0; + n_past = 0; + n_consumed = 0; + std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); // compare the evaluated prompt with the new prompt for (n_past = 0; n_past < prompt_tokens.size() - 1 && n_past < processed_tokens.size(); n_past++) { @@ -98,6 +117,7 @@ struct llama_server_context // Reset context const int n_left = n_past - params.n_keep; n_past = std::max(1, params.n_keep); + last_n_tokens.erase(last_n_tokens.begin() + n_past, last_n_tokens.end()); processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end()); embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size()); } @@ -455,7 +475,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } if (!body["repeat_last_n"].is_null()) { - llama.params.repeat_last_n = body["repeat_last_n"].get(); + llama.params.repeat_last_n = body["repeat_last_n"].get(); } if (!body["temperature"].is_null()) { @@ -475,7 +495,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } if (!body["mirostat"].is_null()) { - llama.params.mirostat = body["mirostat"].get(); + llama.params.mirostat = body["mirostat"].get(); } if (!body["mirostat_tau"].is_null()) { @@ -487,7 +507,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } if (!body["penalize_nl"].is_null()) { - llama.params.penalize_nl = body["penalize_nl"].get(); + llama.params.penalize_nl = body["penalize_nl"].get(); } if (!body["batch_size"].is_null()) { From df0e0d094ca8e24bf144ceeabb3a6c4a297803dc Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Tue, 23 May 2023 06:22:30 -0700 Subject: [PATCH 014/121] Forgot to remove some testing code. 
--- examples/server/server.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index eccb2f2ad7c2f..1ee6ce1d1e0bb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -13,8 +13,6 @@ struct llama_server_context { bool as_loop = false; bool has_next_token = false; - - std::string generated_text = ""; size_t num_tokens_predicted = 0; @@ -65,21 +63,6 @@ struct llama_server_context bool loadPrompt() { params.prompt.insert(0, 1, ' '); // always add a first space - - if(processed_tokens.size() != 0) - { - processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end()); - } - - if(embd_inp.size() != 0) - { - embd_inp.erase(embd_inp.begin() + 1, embd_inp.end()); - } - - n_remain = 0; - n_past = 0; - n_consumed = 0; - std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); // compare the evaluated prompt with the new prompt for (n_past = 0; n_past < prompt_tokens.size() - 1 && n_past < processed_tokens.size(); n_past++) { @@ -117,7 +100,6 @@ struct llama_server_context // Reset context const int n_left = n_past - params.n_keep; n_past = std::max(1, params.n_keep); - last_n_tokens.erase(last_n_tokens.begin() + n_past, last_n_tokens.end()); processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end()); embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size()); } From 549291fe61f51f813f9b44a6dcb1f03fa7074858 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 28 May 2023 12:08:37 +0300 Subject: [PATCH 015/121] keep processed from the beginning this means no limit to the input prompt, it will just get reset again as normal --- examples/server/server.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1ee6ce1d1e0bb..c95226e663f72 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -100,7 +100,7 @@ struct llama_server_context // Reset context const int n_left = n_past - params.n_keep; n_past = std::max(1, params.n_keep); - processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end()); + //processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end()); embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size()); } for (int i = 0; i < (int)embd.size(); i += params.n_batch) @@ -499,10 +499,6 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & { llama.params.n_keep = body["n_keep"].get(); } - if (!body["as_loop"].is_null()) - { - llama.as_loop = body["as_loop"].get(); - } if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); From 177868e68a135d03d4654a55bc2aad57c9a39e7a Mon Sep 17 00:00:00 2001 From: digiwombat Date: Sun, 28 May 2023 06:29:11 -0400 Subject: [PATCH 016/121] Changed to params/args Seed is now set by the CLI, defaults to -1 if not seed is set Threads and batch size are now properly launch parameters. 
--- examples/server/server.cpp | 43 +++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c95226e663f72..2ab5327632111 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -293,7 +293,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms) fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n"); fprintf(stderr, " --embedding enable embedding mode\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); @@ -343,18 +343,6 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } sparams.hostname = argv[i]; } - else if (arg == "-s" || arg == "--seed") - { -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n"); -#endif - if (++i >= argc) - { - invalid_param = true; - break; - } - params.seed = std::stoi(argv[i]); - } else if (arg == "-m" || arg == "--model") { if (++i >= argc) @@ -386,6 +374,23 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para { params.memory_f16 = false; } + else if (arg == "--threads" || arg == "-t") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } + else if (arg == "-b" || arg == "--batch-size") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { if (++i >= argc) @@ -491,14 +496,18 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & { llama.params.penalize_nl = body["penalize_nl"].get(); } - if (!body["batch_size"].is_null()) - { - llama.params.n_batch = body["batch_size"].get(); - } if (!body["n_keep"].is_null()) { llama.params.n_keep = body["n_keep"].get(); } + if (!body["seed"].is_null()) + { + llama.params.seed = body["seed"].get(); + } + else + { + llama.params.seed = -1; + } if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); From e8efd75492f45aeb2a5fa12eba3b417b30028a3d Mon Sep 17 00:00:00 2001 From: digiwombat Date: Sun, 28 May 2023 07:44:31 -0400 Subject: [PATCH 017/121] Initial timeout code and expanded json return on completion. Now passing server params to the help printer so they defaults are ouput. Bad UTF while streaming now returns a replacement character (\uFFFD) Changed some error language very slightly. The JSON now returns extra values, only on `stop` for streaming requests. 
New JSON Return Values: - tokens_predicted (added to streaming) - seed (just pulls it from params, might return -1) - prompt (Might be useful) - generated_text (Full generated response for streaming requests) --- examples/server/server.cpp | 99 +++++++++++++++++++++++++++++--------- 1 file changed, 75 insertions(+), 24 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2ab5327632111..01bbd92361ff0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -7,6 +7,8 @@ struct server_params { std::string hostname = "127.0.0.1"; int32_t port = 8080; + int32_t read_timeout = 600; + int32_t write_timeout = 600; }; struct llama_server_context @@ -287,7 +289,7 @@ using namespace httplib; using json = nlohmann::json; -void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms) +void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, const server_params &sparams) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); @@ -311,14 +313,16 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms) fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stderr, " --host ip address to listen (default 127.0.0.1)\n"); - fprintf(stderr, " --port PORT port to listen (default 8080)\n"); + fprintf(stderr, " --host ip address to listen (default (default: %d)\n", sparams.hostname); + fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); + fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); fprintf(stderr, "\n"); } bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params ¶ms) { gpt_params default_params; + server_params default_sparams; std::string arg; bool invalid_param = false; @@ -343,6 +347,15 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } sparams.hostname = argv[i]; } + else if (arg == "--timeout" || arg == "-to") + { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.read_timeout = std::stoi(argv[i]); + sparams.write_timeout = std::stoi(argv[i]); + } else if (arg == "-m" || arg == "--model") { if (++i >= argc) @@ -358,7 +371,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } else if (arg == "-h" || arg == "--help") { - server_print_usage(argc, argv, default_params); + server_print_usage(argc, argv, default_params, default_sparams); exit(0); } else if (arg == "-c" || arg == "--ctx_size") @@ -421,7 +434,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params); + server_print_usage(argc, argv, default_params, default_sparams); exit(1); } } @@ -429,7 +442,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para if (invalid_param) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params); + server_print_usage(argc, argv, default_params, default_sparams); exit(1); } return true; @@ -538,18 +551,13 @@ int main(int argc, char **argv) llama_server_context llama; params.model = "ggml-model.bin"; + std::string final_text = ""; + if 
(server_params_parse(argc, argv, sparams, params) == false) { return 1; } - if (params.seed <= 0) - { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - // load the model if (!llama.loadModel(params)) { @@ -561,18 +569,19 @@ int main(int argc, char **argv) svr.Get("/", [](const Request &, Response &res) { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); - svr.Post("/completion", [&llama](const Request &req, Response &res) + svr.Post("/completion", [&llama, &final_text](const Request &req, Response &res) { if(llama.params.embedding) { json data = { {"status", "error"}, - {"reason", "To use completion function disable embedding mode"}}; + {"reason", "To use completion function, disable embedding mode"}}; res.set_content(data.dump(), "application/json"); res.status = 400; return; } llama.rewind(); + final_text = ""; if(parse_options_completion(json::parse(req.body), llama, res) == false){ return; @@ -582,7 +591,7 @@ int main(int argc, char **argv) { json data = { {"status", "error"}, - {"reason", "Context too long, please be more specific"}}; + {"reason", "Context too long."}}; res.set_content(data.dump(), "application/json"); res.status = 400; return; @@ -603,7 +612,9 @@ int main(int argc, char **argv) { json data = { {"content", llama.generated_text }, - {"tokens_predicted", llama.num_tokens_predicted}}; + {"tokens_predicted", llama.num_tokens_predicted}, + {"seed", llama.params.seed}, + {"prompt", llama.params.prompt} }; return res.set_content(data.dump(), "application/json"); } catch (const json::exception &e) @@ -641,7 +652,7 @@ int main(int argc, char **argv) return res.set_content(data.dump(), "application/json"); }); - svr.Get("/next-token", [&llama](const Request &req, Response &res) + svr.Get("/next-token", [&llama, &final_text](const Request &req, Response &res) { if(llama.params.embedding) { res.set_content("{}", "application/json"); @@ -654,15 +665,52 @@ int main(int argc, char **argv) result = llama.doCompletion(); // inference next token } try { - json data = { + json data; + if (llama.has_next_token) + { + final_text += result; + data = { + {"content", result }, + {"stop", false } + }; + } + else + { + // Generation is done, send extra information. + data = { {"content", result }, - {"stop", !llama.has_next_token }}; + {"stop", true }, + {"tokens_predicted", llama.num_tokens_predicted}, + {"seed", llama.params.seed}, + {"prompt", llama.params.prompt}, + {"generated_text", final_text} + }; + } + return res.set_content(data.dump(), "application/json"); } catch (const json::exception &e) { // Some tokens have bad UTF-8 strings, the json parser is very sensitive - json data = { - {"content", "" }, - {"stop", !llama.has_next_token }}; + json data; + if (llama.has_next_token) + { + final_text += u8"\uFFFD"; + data = { + {"content", result }, + {"stop", false } + }; + } + else + { + // Generation is done, send extra information. + data = { + {"content", "\uFFFD" }, + {"stop", true }, + {"tokens_predicted", llama.num_tokens_predicted}, + {"seed", llama.params.seed}, + {"prompt", llama.params.prompt}, + {"generated_text", final_text} + }; + } return res.set_content(data.dump(), "application/json"); } }); @@ -673,6 +721,9 @@ int main(int argc, char **argv) fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n"); } - // change hostname and port + // set timeouts and change hostname and port + svr.set_read_timeout(sparams.read_timeout); + svr.set_write_timeout(sparams.write_timeout); svr.listen(sparams.hostname, sparams.port); + } From 23928f28873139277452c904c9fee453cdf8142e Mon Sep 17 00:00:00 2001 From: digiwombat Date: Sun, 28 May 2023 08:04:05 -0400 Subject: [PATCH 018/121] Added generation_settings to final json object. 
--- examples/server/server.cpp | 50 +++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 01bbd92361ff0..b3bec35f1649b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -614,6 +614,22 @@ int main(int argc, char **argv) {"content", llama.generated_text }, {"tokens_predicted", llama.num_tokens_predicted}, {"seed", llama.params.seed}, + {"generation_settings", { + "temp", llama.params.temp, + "top_k", llama.params.top_k, + "top_p", llama.params.top_p, + "tfs_z", llama.params.tfs_z, + "typical_p", llama.params.typical_p, + "repeat_last_n", llama.params.repeat_last_n, + "repeat_penalty", llama.params.repeat_penalty, + "alpha_presence", llama.params.presence_penalty, + "alpha_frequency", llama.params.frequency_penalty, + "mirostat", llama.params.mirostat, + "mirostat_tau", llama.params.mirostat_tau, + "mirostat_eta", llama.params.mirostat_eta, + "penalize_nl", llama.params.penalize_nl + } + }, {"prompt", llama.params.prompt} }; return res.set_content(data.dump(), "application/json"); } @@ -682,11 +698,27 @@ int main(int argc, char **argv) {"stop", true }, {"tokens_predicted", llama.num_tokens_predicted}, {"seed", llama.params.seed}, + {"generation_settings", { + "temp", llama.params.temp, + "top_k", llama.params.top_k, + "top_p", llama.params.top_p, + "tfs_z", llama.params.tfs_z, + "typical_p", llama.params.typical_p, + "repeat_last_n", llama.params.repeat_last_n, + "repeat_penalty", llama.params.repeat_penalty, + "alpha_presence", llama.params.presence_penalty, + "alpha_frequency", llama.params.frequency_penalty, + "mirostat", llama.params.mirostat, + "mirostat_tau", llama.params.mirostat_tau, + "mirostat_eta", llama.params.mirostat_eta, + "penalize_nl", llama.params.penalize_nl + } + }, {"prompt", llama.params.prompt}, {"generated_text", final_text} }; } - + return res.set_content(data.dump(), "application/json"); } catch (const json::exception &e) { // Some tokens have bad UTF-8 strings, the json parser is very sensitive @@ -707,6 +739,22 @@ int main(int argc, char **argv) {"stop", true }, {"tokens_predicted", llama.num_tokens_predicted}, {"seed", llama.params.seed}, + {"generation_settings", { + "temp", llama.params.temp, + "top_k", llama.params.top_k, + "top_p", llama.params.top_p, + "tfs_z", llama.params.tfs_z, + "typical_p", llama.params.typical_p, + "repeat_last_n", llama.params.repeat_last_n, + "repeat_penalty", llama.params.repeat_penalty, + "alpha_presence", llama.params.presence_penalty, + "alpha_frequency", llama.params.frequency_penalty, + "mirostat", llama.params.mirostat, + "mirostat_tau", llama.params.mirostat_tau, + "mirostat_eta", llama.params.mirostat_eta, + "penalize_nl", llama.params.penalize_nl + } + }, {"prompt", llama.params.prompt}, {"generated_text", final_text} }; From 2e5c5ee224ab42f6ab6ca26344cef2cb044a84d5 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Sun, 28 May 2023 08:12:48 -0400 Subject: [PATCH 019/121] Changed JSON names to match the parameter name rather than the variable name. 
--- examples/server/server.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b3bec35f1649b..0286fcc5b43d7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -622,8 +622,8 @@ int main(int argc, char **argv) "typical_p", llama.params.typical_p, "repeat_last_n", llama.params.repeat_last_n, "repeat_penalty", llama.params.repeat_penalty, - "alpha_presence", llama.params.presence_penalty, - "alpha_frequency", llama.params.frequency_penalty, + "presence_penalty", llama.params.presence_penalty, + "frequency_penalty", llama.params.frequency_penalty, "mirostat", llama.params.mirostat, "mirostat_tau", llama.params.mirostat_tau, "mirostat_eta", llama.params.mirostat_eta, @@ -706,8 +706,8 @@ int main(int argc, char **argv) "typical_p", llama.params.typical_p, "repeat_last_n", llama.params.repeat_last_n, "repeat_penalty", llama.params.repeat_penalty, - "alpha_presence", llama.params.presence_penalty, - "alpha_frequency", llama.params.frequency_penalty, + "presence_penalty", llama.params.presence_penalty, + "frequency_penalty", llama.params.frequency_penalty, "mirostat", llama.params.mirostat, "mirostat_tau", llama.params.mirostat_tau, "mirostat_eta", llama.params.mirostat_eta, @@ -747,8 +747,8 @@ int main(int argc, char **argv) "typical_p", llama.params.typical_p, "repeat_last_n", llama.params.repeat_last_n, "repeat_penalty", llama.params.repeat_penalty, - "alpha_presence", llama.params.presence_penalty, - "alpha_frequency", llama.params.frequency_penalty, + "presence_penalty", llama.params.presence_penalty, + "frequency_penalty", llama.params.frequency_penalty, "mirostat", llama.params.mirostat, "mirostat_tau", llama.params.mirostat_tau, "mirostat_eta", llama.params.mirostat_eta, From dda915cac4ad2116d87ea4296bdc4757a155dd4a Mon Sep 17 00:00:00 2001 From: digiwombat Date: Sun, 28 May 2023 08:43:38 -0400 Subject: [PATCH 020/121] Added capturing the stopping word and sending it along with the final JSON. 
Fixed an fprintf warning Fixed a bug that broke streaming Properly removed thread changing in json (only grabbed batch_size before) --- examples/server/server.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0286fcc5b43d7..54be938fc8aad 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -33,13 +33,14 @@ struct llama_server_context llama_context *ctx; gpt_params params; - bool reload_ctx = false; + std::string stopping_word = ""; void rewind() { as_loop = false; params.antiprompt.clear(); num_tokens_predicted = 0; generated_text = ""; + stopping_word = ""; //processed_tokens.clear(); embd_inp.clear(); @@ -233,6 +234,7 @@ struct llama_server_context } if (!embd.empty() && embd.back() == llama_token_eos()) { + stopping_word = llama_token_to_str(ctx, embd.back()); has_next_token = false; } @@ -258,6 +260,7 @@ struct llama_server_context size_t i = generated_text.find(word, generated_text.size() - (word.size() + token_text.size())); if (i != std::string::npos) { generated_text.erase(generated_text.begin() + i, generated_text.begin() + i + word.size()); + stopping_word = word; has_next_token = false; break; } @@ -313,7 +316,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, con fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stderr, " --host ip address to listen (default (default: %d)\n", sparams.hostname); + fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); fprintf(stderr, "\n"); @@ -449,9 +452,9 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } bool parse_options_completion(json body, llama_server_context& llama, Response &res) { - if (!body["threads"].is_null()) + if (!body["as_loop"].is_null()) { - llama.params.n_threads = body["threads"].get(); + llama.as_loop = body["as_loop"].get(); } if (!body["n_predict"].is_null()) { @@ -475,7 +478,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } if (!body["repeat_last_n"].is_null()) { - llama.params.repeat_last_n = body["repeat_last_n"].get(); + llama.params.repeat_last_n = body["repeat_last_n"].get(); } if (!body["temperature"].is_null()) { @@ -630,7 +633,8 @@ int main(int argc, char **argv) "penalize_nl", llama.params.penalize_nl } }, - {"prompt", llama.params.prompt} }; + {"prompt", llama.params.prompt}, + {"stopping_word", llama.stopping_word} }; return res.set_content(data.dump(), "application/json"); } catch (const json::exception &e) @@ -684,6 +688,7 @@ int main(int argc, char **argv) json data; if (llama.has_next_token) { + //fprintf(stdout, "Result: %s\n", result); final_text += result; data = { {"content", result }, @@ -715,6 +720,7 @@ int main(int argc, char **argv) } }, {"prompt", llama.params.prompt}, + {"stopping_word", llama.stopping_word}, {"generated_text", final_text} }; } @@ -735,7 +741,7 @@ int main(int argc, char **argv) { // Generation is done, send extra information. 
data = { - {"content", "\uFFFD" }, + {"content", u8"\uFFFD" }, {"stop", true }, {"tokens_predicted", llama.num_tokens_predicted}, {"seed", llama.params.seed}, @@ -756,6 +762,7 @@ int main(int argc, char **argv) } }, {"prompt", llama.params.prompt}, + {"stopping_word", llama.stopping_word}, {"generated_text", final_text} }; } From 7740301db9be2e9ef2b6b383a5be4d7939c1edc9 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Sun, 28 May 2023 09:18:47 -0400 Subject: [PATCH 021/121] Set unspecified generation settings back to default. (Notes below) - If a given set of values coming along doesn't contain top_k for example, but did before, it would have stayed on the old value, I'm pretty sure. This fixes that. - I don't know if this could be done a bit prettier by just setting llama.params = gpt_params(); since I'm not sure how the default constructor would react since there's not one defined. --- examples/server/server.cpp | 95 ++++++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 15 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 54be938fc8aad..bef3b4090f51c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -452,69 +452,130 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } bool parse_options_completion(json body, llama_server_context& llama, Response &res) { + gpt_params default_params; if (!body["as_loop"].is_null()) { llama.as_loop = body["as_loop"].get(); } if (!body["n_predict"].is_null()) { - llama.params.n_predict = body["n_predict"].get(); + llama.params.n_predict = body["n_predict"].get(); + } + else + { + llama.params.n_predict = default_params.n_predict; } if (!body["top_k"].is_null()) { - llama.params.top_k = body["top_k"].get(); + llama.params.top_k = body["top_k"].get(); + } + else + { + llama.params.top_k = default_params.top_k; } if (!body["top_p"].is_null()) { - llama.params.top_p = body["top_p"].get(); + llama.params.top_p = body["top_p"].get(); + } + else + { + llama.params.top_p = default_params.top_p; } if (!body["tfs_z"].is_null()) { - llama.params.tfs_z = body["tfs_z"].get(); + llama.params.tfs_z = body["tfs_z"].get(); + } + else + { + llama.params.tfs_z = default_params.tfs_z; } if (!body["typical_p"].is_null()) { - llama.params.typical_p = body["typical_p"].get(); + llama.params.typical_p = body["typical_p"].get(); + } + else + { + llama.params.typical_p = default_params.typical_p; } if (!body["repeat_last_n"].is_null()) { - llama.params.repeat_last_n = body["repeat_last_n"].get(); + llama.params.repeat_last_n = body["repeat_last_n"].get(); + } + else + { + llama.params.repeat_last_n = default_params.repeat_last_n; } if (!body["temperature"].is_null()) { - llama.params.temp = body["temperature"].get(); + llama.params.temp = body["temperature"].get(); + } + else + { + llama.params.temp = default_params.temp; } if (!body["repeat_penalty"].is_null()) { - llama.params.repeat_penalty = body["repeat_penalty"].get(); + llama.params.repeat_penalty = body["repeat_penalty"].get(); + } + else + { + llama.params.repeat_penalty = default_params.repeat_penalty; } if (!body["presence_penalty"].is_null()) { - llama.params.presence_penalty = body["presence_penalty"].get(); + llama.params.presence_penalty = body["presence_penalty"].get(); + } + else + { + llama.params.presence_penalty = default_params.presence_penalty; } if (!body["frequency_penalty"].is_null()) { - llama.params.frequency_penalty = body["frequency_penalty"].get(); + llama.params.frequency_penalty = 
body["frequency_penalty"].get(); + } + else + { + llama.params.frequency_penalty = default_params.frequency_penalty; } if (!body["mirostat"].is_null()) { - llama.params.mirostat = body["mirostat"].get(); + llama.params.mirostat = body["mirostat"].get(); + } + else + { + llama.params.mirostat = default_params.mirostat; } if (!body["mirostat_tau"].is_null()) { - llama.params.mirostat_tau = body["mirostat_tau"].get(); + llama.params.mirostat_tau = body["mirostat_tau"].get(); + } + else + { + llama.params.mirostat_tau = default_params.mirostat_tau; } if (!body["mirostat_eta"].is_null()) { - llama.params.mirostat_eta = body["mirostat_eta"].get(); + llama.params.mirostat_eta = body["mirostat_eta"].get(); + } + else + { + llama.params.mirostat_eta = default_params.mirostat_eta; } if (!body["penalize_nl"].is_null()) { - llama.params.penalize_nl = body["penalize_nl"].get(); + llama.params.penalize_nl = body["penalize_nl"].get(); + } + else + { + llama.params.penalize_nl = default_params.penalize_nl; } if (!body["n_keep"].is_null()) { - llama.params.n_keep = body["n_keep"].get(); + llama.params.n_keep = body["n_keep"].get(); + } + else + { + llama.params.n_keep = default_params.n_keep; } if (!body["seed"].is_null()) { @@ -541,6 +602,10 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & { llama.params.antiprompt = body["stop"].get>(); } + else + { + llama.params.antiprompt.clear(); + } return true; } From 7186d655a103c710c168624daf3e7c758586b78b Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 28 May 2023 17:03:01 +0300 Subject: [PATCH 022/121] seed and gen params --- examples/server/server.cpp | 78 ++++++++++++-------------------------- 1 file changed, 25 insertions(+), 53 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 54be938fc8aad..8d863342c1671 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -92,6 +92,7 @@ struct llama_server_context } } n_remain = params.n_predict; + llama_set_rng_seed(ctx, params.seed); } llama_token nextToken() { @@ -518,11 +519,11 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } if (!body["seed"].is_null()) { - llama.params.seed = body["seed"].get(); + llama.params.seed = body["seed"].get(); } else { - llama.params.seed = -1; + llama.params.seed = time(NULL); } if (!body["prompt"].is_null()) { @@ -544,6 +545,25 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & return true; } +json format_generation_settings(const llama_server_context& llama) { + return json { + { "seed", llama.params.seed }, + { "temp", llama.params.temp }, + { "top_k", llama.params.top_k }, + { "top_p", llama.params.top_p }, + { "tfs_z", llama.params.tfs_z }, + { "typical_p", llama.params.typical_p }, + { "repeat_last_n", llama.params.repeat_last_n }, + { "repeat_penalty", llama.params.repeat_penalty }, + { "presence_penalty", llama.params.presence_penalty }, + { "frequency_penalty", llama.params.frequency_penalty }, + { "mirostat", llama.params.mirostat }, + { "mirostat_tau", llama.params.mirostat_tau }, + { "mirostat_eta", llama.params.mirostat_eta }, + { "penalize_nl", llama.params.penalize_nl } + }; +} + int main(int argc, char **argv) { // own arguments required by this example @@ -616,23 +636,7 @@ int main(int argc, char **argv) json data = { {"content", llama.generated_text }, {"tokens_predicted", llama.num_tokens_predicted}, - {"seed", llama.params.seed}, - {"generation_settings", { - "temp", llama.params.temp, - "top_k", 
llama.params.top_k, - "top_p", llama.params.top_p, - "tfs_z", llama.params.tfs_z, - "typical_p", llama.params.typical_p, - "repeat_last_n", llama.params.repeat_last_n, - "repeat_penalty", llama.params.repeat_penalty, - "presence_penalty", llama.params.presence_penalty, - "frequency_penalty", llama.params.frequency_penalty, - "mirostat", llama.params.mirostat, - "mirostat_tau", llama.params.mirostat_tau, - "mirostat_eta", llama.params.mirostat_eta, - "penalize_nl", llama.params.penalize_nl - } - }, + {"generation_settings", format_generation_settings(llama) }, {"prompt", llama.params.prompt}, {"stopping_word", llama.stopping_word} }; return res.set_content(data.dump(), "application/json"); @@ -702,23 +706,7 @@ int main(int argc, char **argv) {"content", result }, {"stop", true }, {"tokens_predicted", llama.num_tokens_predicted}, - {"seed", llama.params.seed}, - {"generation_settings", { - "temp", llama.params.temp, - "top_k", llama.params.top_k, - "top_p", llama.params.top_p, - "tfs_z", llama.params.tfs_z, - "typical_p", llama.params.typical_p, - "repeat_last_n", llama.params.repeat_last_n, - "repeat_penalty", llama.params.repeat_penalty, - "presence_penalty", llama.params.presence_penalty, - "frequency_penalty", llama.params.frequency_penalty, - "mirostat", llama.params.mirostat, - "mirostat_tau", llama.params.mirostat_tau, - "mirostat_eta", llama.params.mirostat_eta, - "penalize_nl", llama.params.penalize_nl - } - }, + {"generation_settings", format_generation_settings(llama) }, {"prompt", llama.params.prompt}, {"stopping_word", llama.stopping_word}, {"generated_text", final_text} @@ -744,23 +732,7 @@ int main(int argc, char **argv) {"content", u8"\uFFFD" }, {"stop", true }, {"tokens_predicted", llama.num_tokens_predicted}, - {"seed", llama.params.seed}, - {"generation_settings", { - "temp", llama.params.temp, - "top_k", llama.params.top_k, - "top_p", llama.params.top_p, - "tfs_z", llama.params.tfs_z, - "typical_p", llama.params.typical_p, - "repeat_last_n", llama.params.repeat_last_n, - "repeat_penalty", llama.params.repeat_penalty, - "presence_penalty", llama.params.presence_penalty, - "frequency_penalty", llama.params.frequency_penalty, - "mirostat", llama.params.mirostat, - "mirostat_tau", llama.params.mirostat_tau, - "mirostat_eta", llama.params.mirostat_eta, - "penalize_nl", llama.params.penalize_nl - } - }, + {"generation_settings", format_generation_settings(llama) }, {"prompt", llama.params.prompt}, {"stopping_word", llama.stopping_word}, {"generated_text", final_text} From 74c6f36bf1de18411b921dc18ef674e4a3acfad8 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 28 May 2023 19:19:34 +0300 Subject: [PATCH 023/121] Editorconfig suggested fixes delete whitespace --- examples/server/server.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e42c955ca9d9a..ee39c8789fe4d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -29,7 +29,6 @@ struct llama_server_context std::vector embd_inp; std::vector last_prompt_tokens; - llama_context *ctx; gpt_params params; @@ -249,7 +248,6 @@ struct llama_server_context if (token == -1) { return ""; } - if(as_loop) { generated_text = ""; } @@ -817,5 +815,4 @@ int main(int argc, char **argv) svr.set_read_timeout(sparams.read_timeout); svr.set_write_timeout(sparams.write_timeout); svr.listen(sparams.hostname, sparams.port); - } From 2c9ee7a05251d94f39c58bccdbb034eb5ecf49cc Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Sun, 28 May 2023 09:34:11 
-0700 Subject: [PATCH 024/121] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Henri Vasserman --- examples/server/server.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ee39c8789fe4d..f849466892f3f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -32,7 +32,7 @@ struct llama_server_context llama_context *ctx; gpt_params params; - std::string stopping_word = ""; + std::string stopping_word; void rewind() { as_loop = false; @@ -255,7 +255,7 @@ struct llama_server_context std::string token_text = llama_token_to_str(ctx, token); generated_text += token_text; - for (std::string word : params.antiprompt) { + for (const std::string& word : params.antiprompt) { size_t i = generated_text.find(word, generated_text.size() - (word.size() + token_text.size())); if (i != std::string::npos) { generated_text.erase(generated_text.begin() + i, generated_text.begin() + i + word.size()); @@ -299,6 +299,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, con fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n"); + fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --embedding enable embedding mode\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); if (llama_mlock_supported()) @@ -637,7 +638,7 @@ int main(int argc, char **argv) llama_server_context llama; params.model = "ggml-model.bin"; - std::string final_text = ""; + std::string final_text; if (server_params_parse(argc, argv, sparams, params) == false) { From 655899db89f14c1e089a9d04d3423bcda86d96a8 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Sun, 28 May 2023 13:49:45 -0400 Subject: [PATCH 025/121] Add ignore_eos option to generation settings. 
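For illustration only (not part of this patch): a minimal client-side sketch of a /completion request that exercises the new option, assuming the json.hpp and httplib.h headers vendored next to server.cpp and a server already listening on 127.0.0.1:8080 (host, port, and prompt text are placeholders). The hunk below maps ignore_eos onto logit_bias for the EOS token.

    // hypothetical client sketch; not taken from the patch itself
    #include "httplib.h"
    #include "json.hpp"
    #include <cstdio>
    using json = nlohmann::json;

    int main() {
        json body = {
            {"prompt", "Building a website can be done in 10 simple steps:"},
            {"n_predict", 64},
            {"ignore_eos", true} // ask the server to keep generating past the EOS token
        };
        httplib::Client cli("127.0.0.1", 8080);
        auto res = cli.Post("/completion", body.dump(), "application/json");
        if (res && res->status == 200) {
            printf("%s\n", json::parse(res->body)["content"].get<std::string>().c_str());
        }
        return 0;
    }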
--- examples/server/server.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f849466892f3f..1b4e28083c34a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -585,6 +585,14 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & { llama.params.seed = time(NULL); } + if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) + { + llama.params.logit_bias[llama_token_eos()] = -INFINITY; + } + else + { + llama.params.logit_bias.erase(llama_token_eos()); + } if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); From b38d41ef521588d59826917a55c9e704868fa389 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Sun, 28 May 2023 13:58:25 -0400 Subject: [PATCH 026/121] --memory_f32 flag to --memory-f32 to match common.cpp --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1b4e28083c34a..7f4e939f39348 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -298,7 +298,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, con fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n"); + fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value\n"); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --embedding enable embedding mode\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); @@ -386,7 +386,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } params.n_ctx = std::stoi(argv[i]); } - else if (arg == "--memory_f32") + else if (arg == "--memory-f32") { params.memory_f16 = false; } From 6c58f64a3bef5d1306729a9b0820084b9b074aa8 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Sun, 28 May 2023 14:17:36 -0400 Subject: [PATCH 027/121] --ctx_size flag to --ctx-size to match common.cpp --- examples/server/server.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7f4e939f39348..703157b3f4451 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -298,6 +298,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, con fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value\n"); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --embedding enable embedding mode\n"); @@ -377,7 +378,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para server_print_usage(argc, argv, default_params, default_sparams); exit(0); } - else if (arg == "-c" || arg == "--ctx_size") + else if (arg == "-c" || arg == "--ctx-size") { if (++i >= argc) { From 
33b69571778860c53af6bcdf963c58df9c154eb2 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Sun, 28 May 2023 16:45:05 -0400 Subject: [PATCH 028/121] Fixed failing to return result on stopping token. --- examples/server/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 703157b3f4451..96aa30ef777e0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -236,6 +236,7 @@ struct llama_server_context if (!embd.empty() && embd.back() == llama_token_eos()) { stopping_word = llama_token_to_str(ctx, embd.back()); has_next_token = false; + return result; } has_next_token = n_remain != 0; From 03ea8f013a65b6574333927b0b604052b6535b3a Mon Sep 17 00:00:00 2001 From: digiwombat Date: Tue, 30 May 2023 15:48:55 -0400 Subject: [PATCH 029/121] Fix for the regen issue. --- examples/server/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6e38b29debd7d..7be83aa92fa77 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -66,6 +66,10 @@ struct llama_server_context bool loadPrompt() { params.prompt.insert(0, 1, ' '); // always add a first space std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); + if (prompt_tokens == last_prompt_tokens) + { + embd.clear(); + } // compare the evaluated prompt with the new prompt for (n_past = 0; n_past < prompt_tokens.size() - 1 && n_past < processed_tokens.size(); n_past++) { if (prompt_tokens[n_past] != processed_tokens[n_past]) { From d6fff56e22455c73efb381fbd03f50c97d3ef2db Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 30 May 2023 19:33:33 -0300 Subject: [PATCH 030/121] add streaming via server-sent events Removes /next-token endpoint and adds a "stream" parameter to the /completion one. 
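As a reading aid (not part of the patch): each chunk the streaming endpoint emits is a server-sent event of the form "data: {...}\n\n", whose JSON payload carries "content" and "stop". A minimal sketch of decoding such a line with the vendored json.hpp, using made-up event payloads:

    // illustration only; field names follow the "data: {...}" chunks in this patch
    #include "json.hpp"
    #include <string>
    #include <cstdio>
    using json = nlohmann::json;

    // returns true while generation should keep going
    static bool handle_sse_line(const std::string & line, std::string & text_out) {
        const std::string prefix = "data: ";
        if (line.compare(0, prefix.size(), prefix) != 0) {
            return true; // skip blank separator lines between events
        }
        json ev = json::parse(line.substr(prefix.size()));
        text_out += ev.value("content", "");
        return !ev.value("stop", false);
    }

    int main() {
        std::string text;
        handle_sse_line("data: {\"content\": \"Hel\", \"stop\": false}", text);
        handle_sse_line("data: {\"content\": \"lo\", \"stop\": true}", text);
        printf("%s\n", text.c_str()); // prints "Hello"
        return 0;
    }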
--- examples/server/server.cpp | 207 ++++++++++++++----------------------- 1 file changed, 77 insertions(+), 130 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7be83aa92fa77..5e7d1c357bee7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -13,7 +13,7 @@ struct server_params struct llama_server_context { - bool as_loop = false; + bool stream = false; bool has_next_token = false; std::string generated_text = ""; @@ -35,7 +35,6 @@ struct llama_server_context std::string stopping_word; void rewind() { - as_loop = false; params.antiprompt.clear(); num_tokens_predicted = 0; generated_text = ""; @@ -253,9 +252,6 @@ struct llama_server_context if (token == -1) { return ""; } - if(as_loop) { - generated_text = ""; - } std::string token_text = llama_token_to_str(ctx, token); generated_text += token_text; @@ -270,7 +266,7 @@ struct llama_server_context } } - return generated_text; + return token_text; } std::vector embedding(std::string content, int threads) { @@ -478,9 +474,13 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para bool parse_options_completion(json body, llama_server_context& llama, Response &res) { gpt_params default_params; - if (!body["as_loop"].is_null()) + if (!body["stream"].is_null()) + { + llama.stream = body["stream"].get(); + } + else { - llama.as_loop = body["as_loop"].get(); + llama.stream = false; } if (!body["n_predict"].is_null()) { @@ -671,8 +671,6 @@ int main(int argc, char **argv) llama_server_context llama; params.model = "ggml-model.bin"; - std::string final_text; - if (server_params_parse(argc, argv, sparams, params) == false) { return 1; @@ -689,65 +687,80 @@ int main(int argc, char **argv) svr.Get("/", [](const Request &, Response &res) { res.set_content("

<h1>llama.cpp server works</h1>
", "text/html"); }); - svr.Post("/completion", [&llama, &final_text](const Request &req, Response &res) - { - if(llama.params.embedding) { - json data = { - {"status", "error"}, - {"reason", "To use completion function, disable embedding mode"}}; - res.set_content(data.dump(), "application/json"); - res.status = 400; - return; - } + svr.Post("/completion", [&llama](const Request &req, Response &res) { + if (llama.params.embedding) { + json data = { + {"status", "error"}, + {"reason", "To use completion function, disable embedding mode"}}; + res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), + "application/json"); + res.status = 400; + return; + } - llama.rewind(); - final_text = ""; + llama.rewind(); - if(parse_options_completion(json::parse(req.body), llama, res) == false){ - return; - } + if (parse_options_completion(json::parse(req.body), llama, res) == false) { + return; + } - if (!llama.loadPrompt()) - { - json data = { - {"status", "error"}, - {"reason", "Context too long."}}; - res.set_content(data.dump(), "application/json"); - res.status = 400; - return; + if (!llama.loadPrompt()) { + json data = {{"status", "error"}, {"reason", "Context too long."}}; + res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), + "application/json"); + res.status = 400; + return; + } + + llama.beginCompletion(); + + if (!llama.stream) { + while (llama.has_next_token) { + llama.doCompletion(); + } + + json data = {{"content", llama.generated_text}, + {"stop", true}, + {"model", llama.params.model_alias }, + {"tokens_predicted", llama.num_tokens_predicted}, + {"generation_settings", format_generation_settings(llama)}, + {"prompt", llama.params.prompt}, + {"stopping_word", llama.stopping_word}}; + return res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); + } else { + const auto chunked_content_provider = [&](size_t, DataSink &sink) { + while (llama.has_next_token) { + std::string token_text = llama.doCompletion(); + + json data; + if (llama.has_next_token) { + data = {{"content", token_text}, {"stop", false}}; + } else { + // Generation is done, send extra information. 
+ data = { + {"content", token_text}, + {"stop", true}, + {"model", llama.params.model_alias}, + {"tokens_predicted", llama.num_tokens_predicted}, + {"generation_settings", format_generation_settings(llama)}, + {"prompt", llama.params.prompt}, + {"stopping_word", llama.stopping_word}, + {"generated_text", llama.generated_text}}; + } + + std::string str = + "data: " + + data.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + sink.write(str.data(), str.size()); } - llama.beginCompletion(); - if(llama.as_loop) { - json data = { - {"status", "done" } }; - return res.set_content(data.dump(), "application/json"); - } else { - // loop inference until finish completion - while (llama.has_next_token) - { - llama.doCompletion(); - } - try - { - json data = { - {"model", llama.params.model_alias }, - {"content", llama.generated_text }, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word} }; - return res.set_content(data.dump(), "application/json"); - } - catch (const json::exception &e) - { - // Some tokens have bad UTF-8 strings, the json parser is very sensitive - json data = { - {"content", "Bad encoding token"}, - {"tokens_predicted", 0}}; - return res.set_content(data.dump(), "application/json"); - } - } }); + sink.done(); + return true; + }; + res.set_chunked_content_provider("text/event-stream", chunked_content_provider); + } + }); svr.Post("/tokenize", [&llama](const Request &req, Response &res) { @@ -774,72 +787,6 @@ int main(int argc, char **argv) return res.set_content(data.dump(), "application/json"); }); - svr.Get("/next-token", [&llama, &final_text](const Request &req, Response &res) - { - if(llama.params.embedding) { - res.set_content("{}", "application/json"); - return; - } - std::string result = ""; - if (req.has_param("stop")) { - llama.has_next_token = false; - } else { - result = llama.doCompletion(); // inference next token - } - try { - json data; - if (llama.has_next_token) - { - //fprintf(stdout, "Result: %s\n", result); - final_text += result; - data = { - {"content", result }, - {"stop", false } - }; - } - else - { - // Generation is done, send extra information. - data = { - {"content", result }, - {"stop", true }, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word}, - {"generated_text", final_text} - }; - } - - return res.set_content(data.dump(), "application/json"); - } catch (const json::exception &e) { - // Some tokens have bad UTF-8 strings, the json parser is very sensitive - json data; - if (llama.has_next_token) - { - final_text += u8"\uFFFD"; - data = { - {"content", result }, - {"stop", false } - }; - } - else - { - // Generation is done, send extra information. 
- data = { - {"content", u8"\uFFFD" }, - {"stop", true }, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word}, - {"generated_text", final_text} - }; - } - return res.set_content(data.dump(), "application/json"); - } - }); - fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port); if(params.embedding) { From 3292f057dc2165efde77a813ba65d64526580f4d Mon Sep 17 00:00:00 2001 From: digiwombat Date: Tue, 30 May 2023 19:44:16 -0400 Subject: [PATCH 031/121] Changed to single API endpoint for streaming and non. next-token endpoint removed. "as_loop" setting changed to "streaming" --- examples/server/server.cpp | 120 ++++++++++++++----------------------- 1 file changed, 45 insertions(+), 75 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7be83aa92fa77..5af5fbeafca32 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -13,7 +13,7 @@ struct server_params struct llama_server_context { - bool as_loop = false; + bool streaming = false; bool has_next_token = false; std::string generated_text = ""; @@ -35,7 +35,7 @@ struct llama_server_context std::string stopping_word; void rewind() { - as_loop = false; + streaming = false; params.antiprompt.clear(); num_tokens_predicted = 0; generated_text = ""; @@ -253,7 +253,7 @@ struct llama_server_context if (token == -1) { return ""; } - if(as_loop) { + if(streaming) { generated_text = ""; } @@ -478,9 +478,9 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para bool parse_options_completion(json body, llama_server_context& llama, Response &res) { gpt_params default_params; - if (!body["as_loop"].is_null()) + if (!body["streaming"].is_null()) { - llama.as_loop = body["as_loop"].get(); + llama.streaming = body["streaming"].get(); } if (!body["n_predict"].is_null()) { @@ -718,11 +718,46 @@ int main(int argc, char **argv) } llama.beginCompletion(); - if(llama.as_loop) { - json data = { - {"status", "done" } }; - return res.set_content(data.dump(), "application/json"); - } else { + if(llama.streaming) + { + fprintf(stdout, "In streaming\n"); + res.set_chunked_content_provider("text/event-stream", [&](size_t /*offset*/, + DataSink& sink) { + std::string final_text = ""; + // loop inference until finish completion + while (llama.has_next_token) { + std::string result = llama.doCompletion(); + json data; + final_text += result; + fprintf(stdout, "Result: %s\n", result); + if (llama.has_next_token) + { + data = { {"content", result}, {"stop", false} }; + } + else + { + // Generation is done, send extra information. 
+ data = { {"content", result}, + {"stop", true}, + {"tokens_predicted", llama.num_tokens_predicted}, + {"generation_settings", format_generation_settings(llama)}, + {"prompt", llama.params.prompt}, + {"stopping_word", llama.stopping_word}, + {"generated_text", final_text} }; + } + + std::string str = + "data: " + data.dump(4, ' ', false, json::error_handler_t::replace) + + "\n\n"; + sink.write(str.data(), str.size()); + } + + sink.done(); + return true; + }); + } + else + { // loop inference until finish completion while (llama.has_next_token) { @@ -774,71 +809,6 @@ int main(int argc, char **argv) return res.set_content(data.dump(), "application/json"); }); - svr.Get("/next-token", [&llama, &final_text](const Request &req, Response &res) - { - if(llama.params.embedding) { - res.set_content("{}", "application/json"); - return; - } - std::string result = ""; - if (req.has_param("stop")) { - llama.has_next_token = false; - } else { - result = llama.doCompletion(); // inference next token - } - try { - json data; - if (llama.has_next_token) - { - //fprintf(stdout, "Result: %s\n", result); - final_text += result; - data = { - {"content", result }, - {"stop", false } - }; - } - else - { - // Generation is done, send extra information. - data = { - {"content", result }, - {"stop", true }, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word}, - {"generated_text", final_text} - }; - } - - return res.set_content(data.dump(), "application/json"); - } catch (const json::exception &e) { - // Some tokens have bad UTF-8 strings, the json parser is very sensitive - json data; - if (llama.has_next_token) - { - final_text += u8"\uFFFD"; - data = { - {"content", result }, - {"stop", false } - }; - } - else - { - // Generation is done, send extra information. - data = { - {"content", u8"\uFFFD" }, - {"stop", true }, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word}, - {"generated_text", final_text} - }; - } - return res.set_content(data.dump(), "application/json"); - } - }); fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port); From 38eaf2b7f790d8851e8fa8136ab245ab8a781422 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Tue, 30 May 2023 19:48:43 -0400 Subject: [PATCH 032/121] Removed testing fprintf calls. --- examples/server/server.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5af5fbeafca32..d10e416b60147 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -720,7 +720,6 @@ int main(int argc, char **argv) llama.beginCompletion(); if(llama.streaming) { - fprintf(stdout, "In streaming\n"); res.set_chunked_content_provider("text/event-stream", [&](size_t /*offset*/, DataSink& sink) { std::string final_text = ""; @@ -729,7 +728,6 @@ int main(int argc, char **argv) std::string result = llama.doCompletion(); json data; final_text += result; - fprintf(stdout, "Result: %s\n", result); if (llama.has_next_token) { data = { {"content", result}, {"stop", false} }; From a25f830fe1b7d0cec7fd57fbfc3dba67e4c7906b Mon Sep 17 00:00:00 2001 From: digiwombat Date: Tue, 30 May 2023 20:17:18 -0400 Subject: [PATCH 033/121] Default streaming to false if it's not set in the request body. 
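A small illustration of why the explicit else branch matters (not from the patch): with nlohmann::json, indexing a missing key through the non-const operator[] yields a null value, so is_null() is what detects an omitted field; and because the same llama_server_context is reused across requests, the else branch keeps a streaming flag left over from a previous request from leaking into the next one.

    // sketch: an omitted "streaming" field shows up as null and falls back to false
    #include "json.hpp"
    #include <cassert>
    using json = nlohmann::json;

    int main() {
        json body = json::parse(R"({"prompt": "hi"})");
        bool streaming = true; // pretend a previous request left it enabled
        if (!body["streaming"].is_null()) {
            streaming = body["streaming"].get<bool>();
        } else {
            streaming = false; // the default this patch enforces server-side
        }
        assert(!streaming);
        return 0;
    }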
--- examples/server/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d10e416b60147..095ae4bc38bc4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -482,6 +482,10 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & { llama.streaming = body["streaming"].get(); } + else + { + llama.streaming = false; + } if (!body["n_predict"].is_null()) { llama.params.n_predict = body["n_predict"].get(); From 7a853dc56d71fba64172f811e9e27a917922a832 Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 30 May 2023 21:39:30 -0300 Subject: [PATCH 034/121] prevent the server from swallowing exceptions in debug mode So it's easier to catch them inside a debugger. --- examples/server/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index bd65c84b15f8c..b38fa864a5c9b 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,6 +1,11 @@ set(TARGET server) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp json.hpp httplib.h) +target_compile_definitions(${TARGET} PRIVATE + $<$: + CPPHTTPLIB_NO_EXCEPTIONS=1 + > +) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) From aa0788b65056151957ed14c0d2fce0687a0ba1b2 Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 30 May 2023 21:41:55 -0300 Subject: [PATCH 035/121] add --verbose flag and request logging --- examples/server/server.cpp | 156 +++++++++++++++++++++++++++++++++---- 1 file changed, 142 insertions(+), 14 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index bcb7cd4b93671..a4bb38244f3e9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -9,6 +9,7 @@ struct server_params int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; + bool verbose = false; }; struct llama_server_context @@ -34,6 +35,9 @@ struct llama_server_context std::string stopping_word; + bool verbose = false; + int json_indent = -1; + void rewind() { params.antiprompt.clear(); num_tokens_predicted = 0; @@ -239,6 +243,9 @@ struct llama_server_context if (!embd.empty() && embd.back() == llama_token_eos()) { stopping_word = llama_token_to_str(ctx, embd.back()); has_next_token = false; + if (verbose) { + fprintf(stderr, "eos token found!\n"); + } return result; } @@ -256,6 +263,20 @@ struct llama_server_context std::string token_text = llama_token_to_str(ctx, token); generated_text += token_text; + if (verbose) { + fprintf(stderr, + "next token: {\n" + " token: %d,\n" + " token_text: \"%s\",\n" + " has_next_token: %d,\n" + " n_remain: %ld,\n" + " num_tokens_predicted: %ld,\n" + " stopping_word: \"%s\",\n" + "}\n", + token, token_text.c_str(), has_next_token, n_remain, num_tokens_predicted, + stopping_word.c_str()); + } + for (const std::string& word : params.antiprompt) { size_t i = generated_text.find(word, generated_text.size() - (word.size() + token_text.size())); if (i != std::string::npos) { @@ -298,6 +319,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, con fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -v, --verbose verbose output (default: false)\n"); fprintf(stderr, " -t N, --threads N number of threads to use during 
computation (default: %d)\n", params.n_threads); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); @@ -454,6 +476,8 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para break; } params.lora_base = argv[i]; + } else if (arg == "-v" || arg == "--verbose") { + sparams.verbose = true; } else { @@ -627,7 +651,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & json data = { {"status", "error"}, {"reason", "You need to pass the prompt"}}; - res.set_content(data.dump(), "application/json"); + res.set_content(data.dump(llama.json_indent), "application/json"); res.status = 400; return false; } @@ -639,6 +663,45 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & { llama.params.antiprompt.clear(); } + + if (llama.verbose) { + std::string tmp_stop = + std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(), + std::string{}, [](std::string a, std::string b) { + return a + (a != "" ? ", \"" : "") + b + "\""; + }); + + fprintf(stderr, + "-------------------------\n" + "/completion parameters: {\n" + " stream: %d,\n" + " frequency_penalty: %f,\n" + " mirostat: %d,\n" + " mirostat_eta: %f,\n" + " mirostat_tau: %f,\n" + " n_keep: %d,\n" + " n_predict: %d,\n" + " penalize_nl: %d,\n" + " presence_penalty: %f,\n" + " repeat_last_n: %d,\n" + " repeat_penalty: %f,\n" + " seed: %d,\n" + " stop: [%s],\n" + " temperature: %f,\n" + " tfs_z: %f,\n" + " top_k: %d,\n" + " top_p: %f,\n" + " typical_p: %f,\n" + "}\nPROMPT[%s]\n", + llama.stream, llama.params.frequency_penalty, llama.params.mirostat, + llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep, + llama.params.n_predict, llama.params.penalize_nl, + llama.params.presence_penalty, llama.params.repeat_last_n, + llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(), + llama.params.temp, llama.params.tfs_z, llama.params.top_k, + llama.params.top_p, llama.params.typical_p, llama.params.prompt.c_str()); + } + return true; } @@ -661,6 +724,44 @@ json format_generation_settings(const llama_server_context& llama) { }; } +std::string log(const Request &req, const Response &res) +{ + std::string s; + + s += "============ REQUEST ===========\n"; + s += "< "; + s += req.method; + s += " "; + s += req.path; + s += " "; + s += req.version; + s += "\n"; + + if (!req.body.empty()) { + std::string line; + std::istringstream stream(req.body); + while (std::getline(stream, line)) { + s += "< " + line + "\n"; + } + } + + s += "------------ RESPONSE ------------\n> "; + s += res.version; + s += " "; + s += std::to_string(res.status); + s += "\n"; + + if (!res.body.empty()) { + std::string line; + std::istringstream stream(res.body); + while (std::getline(stream, line)) { + s += "> " + line + "\n"; + } + } + + return s; +} + int main(int argc, char **argv) { // own arguments required by this example @@ -676,6 +777,9 @@ int main(int argc, char **argv) return 1; } + llama.verbose = sparams.verbose; + llama.json_indent = sparams.verbose ? 
4 : -1; + // load the model if (!llama.loadModel(params)) { @@ -692,8 +796,9 @@ int main(int argc, char **argv) json data = { {"status", "error"}, {"reason", "To use completion function, disable embedding mode"}}; - res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), - "application/json"); + res.set_content( + data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), + "application/json"); res.status = 400; return; } @@ -706,8 +811,9 @@ int main(int argc, char **argv) if (!llama.loadPrompt()) { json data = {{"status", "error"}, {"reason", "Context too long."}}; - res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), - "application/json"); + res.set_content( + data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), + "application/json"); res.status = 400; return; } @@ -721,12 +827,14 @@ int main(int argc, char **argv) json data = {{"content", llama.generated_text}, {"stop", true}, - {"model", llama.params.model_alias }, + {"model", llama.params.model_alias}, {"tokens_predicted", llama.num_tokens_predicted}, {"generation_settings", format_generation_settings(llama)}, {"prompt", llama.params.prompt}, {"stopping_word", llama.stopping_word}}; - return res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); + return res.set_content( + data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), + "application/json"); } else { const auto chunked_content_provider = [&](size_t, DataSink &sink) { while (llama.has_next_token) { @@ -748,10 +856,10 @@ int main(int argc, char **argv) {"generated_text", llama.generated_text}}; } - std::string str = - "data: " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; + std::string str = "data: " + + data.dump(llama.json_indent, ' ', false, + json::error_handler_t::replace) + + "\n\n"; sink.write(str.data(), str.size()); } @@ -768,7 +876,7 @@ int main(int argc, char **argv) json body = json::parse(req.body); json data = { {"tokens", ::llama_tokenize(llama.ctx, body["content"].get(), false) } }; - return res.set_content(data.dump(), "application/json"); + return res.set_content(data.dump(llama.json_indent), "application/json"); }); svr.Post("/embedding", [&llama](const Request &req, Response &res) @@ -778,14 +886,14 @@ int main(int argc, char **argv) json data = { {"embedding", empty}}; fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n"); - return res.set_content(data.dump(), "application/json"); + return res.set_content(data.dump(llama.json_indent), "application/json"); } json body = json::parse(req.body); std::string content = body["content"].get(); int threads = body["threads"].get(); json data = { {"embedding", llama.embedding(content, threads) } }; - return res.set_content(data.dump(), "application/json"); + return res.set_content(data.dump(llama.json_indent), "application/json"); }); fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port); @@ -794,6 +902,26 @@ int main(int argc, char **argv) fprintf(stderr, "NOTE: Mode embedding enabled. 
Completion function doesn't work in this mode.\n"); } + if (llama.verbose) { + svr.set_logger([](const Request &req, const Response &res) { + fprintf(stderr, "%s", log(req, res).c_str()); + }); + } + + svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { + auto fmt = "500 Internal Server Error\n%s"; + char buf[BUFSIZ]; + try { + std::rethrow_exception(ep); + } catch (std::exception &e) { + snprintf(buf, sizeof(buf), fmt, e.what()); + } catch (...) { + snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); + } + res.set_content(buf, "text/plain"); + res.status = 500; + }); + // set timeouts and change hostname and port svr.set_read_timeout(sparams.read_timeout); svr.set_write_timeout(sparams.write_timeout); From b6f536dfb37051b677735303845e3eaceb354a01 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Tue, 30 May 2023 21:14:24 -0400 Subject: [PATCH 036/121] Cull to end of generated_text when encountering a stopping string in case it's a partial token. Will roll this back if it proves to be a problem. --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a4bb38244f3e9..3d9cd5ca11809 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -280,7 +280,7 @@ struct llama_server_context for (const std::string& word : params.antiprompt) { size_t i = generated_text.find(word, generated_text.size() - (word.size() + token_text.size())); if (i != std::string::npos) { - generated_text.erase(generated_text.begin() + i, generated_text.begin() + i + word.size()); + generated_text.erase(generated_text.begin() + i, generated_text.end()); stopping_word = word; has_next_token = false; break; From 7a8104fbd26326f4d4c618a027942e62efc8f1bb Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 30 May 2023 23:11:32 -0300 Subject: [PATCH 037/121] add missing quote when printing stopping strings --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3d9cd5ca11809..8d07f39d7480d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -668,7 +668,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & std::string tmp_stop = std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(), std::string{}, [](std::string a, std::string b) { - return a + (a != "" ? ", \"" : "") + b + "\""; + return a + (a != "" ? 
", \"" : "\"") + b + "\""; }); fprintf(stderr, From 3a079d5cc895620de11bf64c1448b189dbaa0954 Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 30 May 2023 23:12:00 -0300 Subject: [PATCH 038/121] stop generating when the stream is closed --- examples/server/server.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8d07f39d7480d..ad46f56e9e58d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -860,7 +860,12 @@ int main(int argc, char **argv) data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace) + "\n\n"; - sink.write(str.data(), str.size()); + if (!sink.write(str.data(), str.size())) { + if (llama.verbose) { + fprintf(stderr, "stream closed\n"); + } + return false; + } } sink.done(); From c1cbde82a12d59a0ee8ae2ae6025c99f18c1e526 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 00:00:56 -0300 Subject: [PATCH 039/121] print error when server can't bind to the interface --- examples/server/server.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ad46f56e9e58d..5c16628659aed 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -901,8 +901,6 @@ int main(int argc, char **argv) return res.set_content(data.dump(llama.json_indent), "application/json"); }); - fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port); - if(params.embedding) { fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n"); } @@ -930,5 +928,16 @@ int main(int argc, char **argv) // set timeouts and change hostname and port svr.set_read_timeout(sparams.read_timeout); svr.set_write_timeout(sparams.write_timeout); - svr.listen(sparams.hostname, sparams.port); + + if (!svr.bind_to_port(sparams.hostname, sparams.port)) { + fprintf(stderr, "%s: ERROR: couldn't bind server to %s:%i\n", __func__, + sparams.hostname.c_str(), sparams.port); + return 1; + } + + fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, + sparams.hostname.c_str(), sparams.port); + if (!svr.listen_after_bind()) { + return 1; + } } From 2c08f29691d6a69bb1c26db2a239e8a8124c313d Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 09:02:32 -0300 Subject: [PATCH 040/121] make api server use only a single thread --- examples/server/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index b38fa864a5c9b..67b0867545574 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -2,6 +2,9 @@ set(TARGET server) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp json.hpp httplib.h) target_compile_definitions(${TARGET} PRIVATE + # single thread + CPPHTTPLIB_THREAD_POOL_COUNT=1 + # crash the server in the debug mode, otherwise send http 500 error $<$: CPPHTTPLIB_NO_EXCEPTIONS=1 > From 284bc293b1e003659416e776d1b9528ebca38d10 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:46:06 -0300 Subject: [PATCH 041/121] reserve memory for generated_text --- examples/server/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5c16628659aed..b4233322867fe 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -42,6 +42,7 @@ struct llama_server_context 
params.antiprompt.clear(); num_tokens_predicted = 0; generated_text = ""; + generated_text.reserve(params.n_ctx); stopping_word = ""; //processed_tokens.clear(); From f1710b90dcd4fb47a170e8e05faceb26ed594580 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:35:25 -0300 Subject: [PATCH 042/121] add infinite generation when n_predict is -1 --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b4233322867fe..b0f0486b76a8b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -250,7 +250,7 @@ struct llama_server_context return result; } - has_next_token = n_remain != 0; + has_next_token = params.n_predict == -1 ? true : n_remain != 0; return result; } From aa2bbb2d357617907278b5102abbae49bab2236a Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:36:51 -0300 Subject: [PATCH 043/121] fix parameter type --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b0f0486b76a8b..37b5b78d3a33b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -52,7 +52,7 @@ struct llama_server_context n_consumed = 0; } - bool loadModel(gpt_params params_) + bool loadModel(const gpt_params ¶ms_) { params = params_; ctx = llama_init_from_gpt_params(params); From 27911d6d68d465dc944af508aeb284288019eb3b Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:37:52 -0300 Subject: [PATCH 044/121] fix default model alias --- examples/server/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 37b5b78d3a33b..fbfcc6b7f0998 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -781,6 +781,10 @@ int main(int argc, char **argv) llama.verbose = sparams.verbose; llama.json_indent = sparams.verbose ? 
4 : -1; + if (params.model_alias == "unknown") { + params.model_alias = params.model; + } + // load the model if (!llama.loadModel(params)) { From dd3021933232573bfdde2cb249c22ab332d353f3 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:40:42 -0300 Subject: [PATCH 045/121] buffer incomplete multi-byte characters --- examples/server/server.cpp | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fbfcc6b7f0998..b78992a13da67 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -842,16 +842,49 @@ int main(int argc, char **argv) "application/json"); } else { const auto chunked_content_provider = [&](size_t, DataSink &sink) { + size_t sent_count = 0; + int32_t multibyte_pending = 0; + while (llama.has_next_token) { std::string token_text = llama.doCompletion(); + if (multibyte_pending > 0) { + multibyte_pending -= token_text.size(); + } else if (token_text.size() == 1) { + const char c = token_text[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) { + multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF0) == 0xE0) { + multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF8) == 0xF0) { + multibyte_pending = 3; + } else { + multibyte_pending = 0; + } + } + + if (multibyte_pending > 0) { + if (!llama.has_next_token) { + llama.has_next_token = true; + llama.n_remain++; + } + continue; + } + + const size_t pos = std::min(sent_count, llama.generated_text.size()); + std::string to_send = llama.generated_text.substr(pos); + sent_count += to_send.size(); + json data; if (llama.has_next_token) { - data = {{"content", token_text}, {"stop", false}}; + data = {{"content", to_send}, {"stop", false}}; } else { // Generation is done, send extra information. data = { - {"content", token_text}, + {"content", to_send}, {"stop", true}, {"model", llama.params.model_alias}, {"tokens_predicted", llama.num_tokens_predicted}, From 40e13805d983c93598249c2673ba9fc4e8f1dc0d Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:41:47 -0300 Subject: [PATCH 046/121] print timings + build info I don't know if llama_free is needed but it was used in main.cpp. 
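Aside on the multi-byte buffering added in PATCH 045 above (an illustration, not part of either patch): the lead byte of a UTF-8 sequence encodes how many continuation bytes follow, which is exactly what the streaming loop uses to hold a chunk back until the character is complete.

    // sketch of the UTF-8 lead-byte rule the buffering relies on
    #include <cstdio>

    static int pending_continuation_bytes(unsigned char c) {
        if ((c & 0xE0) == 0xC0) return 1; // 110xxxxx starts a 2-byte character
        if ((c & 0xF0) == 0xE0) return 2; // 1110xxxx starts a 3-byte character
        if ((c & 0xF8) == 0xF0) return 3; // 11110xxx starts a 4-byte character
        return 0;                         // ASCII or a continuation byte
    }

    int main() {
        // "é" is 0xC3 0xA9 in UTF-8: after seeing only 0xC3, one more byte is still pending
        printf("%d\n", pending_continuation_bytes(0xC3)); // prints 1
        printf("%d\n", pending_continuation_bytes('a'));  // prints 0
        return 0;
    }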
--- examples/server/server.cpp | 49 ++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b78992a13da67..acccbc9d7ca4b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,7 +1,9 @@ -#include -#include #include "common.h" #include "llama.h" +#include "build-info.h" + +#include +#include struct server_params { @@ -30,7 +32,7 @@ struct llama_server_context std::vector embd_inp; std::vector last_prompt_tokens; - llama_context *ctx; + llama_context *ctx = nullptr; gpt_params params; std::string stopping_word; @@ -38,6 +40,14 @@ struct llama_server_context bool verbose = false; int json_indent = -1; + ~llama_server_context() + { + if (ctx) { + llama_free(ctx); + ctx = nullptr; + } + } + void rewind() { params.antiprompt.clear(); num_tokens_predicted = 0; @@ -765,6 +775,8 @@ std::string log(const Request &req, const Response &res) int main(int argc, char **argv) { + llama_init_backend(); + // own arguments required by this example gpt_params params; server_params sparams; @@ -785,6 +797,10 @@ int main(int argc, char **argv) params.model_alias = params.model; } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads, + std::thread::hardware_concurrency(), llama_print_system_info()); + // load the model if (!llama.loadModel(params)) { @@ -809,6 +825,7 @@ int main(int argc, char **argv) } llama.rewind(); + llama_reset_timings(llama.ctx); if (parse_options_completion(json::parse(req.body), llama, res) == false) { return; @@ -837,6 +854,11 @@ int main(int argc, char **argv) {"generation_settings", format_generation_settings(llama)}, {"prompt", llama.params.prompt}, {"stopping_word", llama.stopping_word}}; + + if (llama.verbose) { + llama_print_timings(llama.ctx); + } + return res.set_content( data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), "application/json"); @@ -894,18 +916,29 @@ int main(int argc, char **argv) {"generated_text", llama.generated_text}}; } - std::string str = "data: " + - data.dump(llama.json_indent, ' ', false, - json::error_handler_t::replace) + - "\n\n"; + std::string str = + "data: " + + data.dump(llama.has_next_token ? 
-1 : llama.json_indent, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + + if (llama.verbose) { + fprintf(stderr, "to_send=%s", str.c_str()); + } + if (!sink.write(str.data(), str.size())) { if (llama.verbose) { fprintf(stderr, "stream closed\n"); + llama_print_timings(llama.ctx); } return false; } } + if (llama.verbose) { + llama_print_timings(llama.ctx); + } + sink.done(); return true; }; @@ -978,4 +1011,6 @@ int main(int argc, char **argv) if (!svr.listen_after_bind()) { return 1; } + + return 0; } From d58e48663d119d439abbd388390f7101dec3bbe5 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 11:56:12 -0300 Subject: [PATCH 047/121] default penalize_nl to false + format --- examples/server/server.cpp | 321 +++++++++++++++---------------------- 1 file changed, 131 insertions(+), 190 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index acccbc9d7ca4b..eb75ab1de23b7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -507,210 +507,151 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para return true; } -bool parse_options_completion(json body, llama_server_context& llama, Response &res) { +bool parse_options_completion(json body, llama_server_context& llama, Response &res) +{ gpt_params default_params; - if (!body["stream"].is_null()) - { - llama.stream = body["stream"].get(); - } - else - { - llama.stream = false; - } - if (!body["n_predict"].is_null()) - { - llama.params.n_predict = body["n_predict"].get(); - } - else - { - llama.params.n_predict = default_params.n_predict; - } - if (!body["top_k"].is_null()) - { - llama.params.top_k = body["top_k"].get(); - } - else - { - llama.params.top_k = default_params.top_k; - } - if (!body["top_p"].is_null()) - { - llama.params.top_p = body["top_p"].get(); - } - else - { - llama.params.top_p = default_params.top_p; - } - if (!body["tfs_z"].is_null()) - { - llama.params.tfs_z = body["tfs_z"].get(); - } - else - { - llama.params.tfs_z = default_params.tfs_z; - } - if (!body["typical_p"].is_null()) - { - llama.params.typical_p = body["typical_p"].get(); - } - else - { - llama.params.typical_p = default_params.typical_p; - } - if (!body["repeat_last_n"].is_null()) - { - llama.params.repeat_last_n = body["repeat_last_n"].get(); - } - else - { - llama.params.repeat_last_n = default_params.repeat_last_n; - } - if (!body["temperature"].is_null()) - { - llama.params.temp = body["temperature"].get(); - } - else - { - llama.params.temp = default_params.temp; - } - if (!body["repeat_penalty"].is_null()) - { - llama.params.repeat_penalty = body["repeat_penalty"].get(); - } - else - { - llama.params.repeat_penalty = default_params.repeat_penalty; - } - if (!body["presence_penalty"].is_null()) - { - llama.params.presence_penalty = body["presence_penalty"].get(); - } - else - { - llama.params.presence_penalty = default_params.presence_penalty; - } - if (!body["frequency_penalty"].is_null()) - { - llama.params.frequency_penalty = body["frequency_penalty"].get(); - } - else - { - llama.params.frequency_penalty = default_params.frequency_penalty; - } - if (!body["mirostat"].is_null()) - { - llama.params.mirostat = body["mirostat"].get(); - } - else - { - llama.params.mirostat = default_params.mirostat; - } - if (!body["mirostat_tau"].is_null()) - { - llama.params.mirostat_tau = body["mirostat_tau"].get(); - } - else - { - llama.params.mirostat_tau = default_params.mirostat_tau; - } - if (!body["mirostat_eta"].is_null()) - { - llama.params.mirostat_eta = 
body["mirostat_eta"].get(); - } - else - { - llama.params.mirostat_eta = default_params.mirostat_eta; - } - if (!body["penalize_nl"].is_null()) - { - llama.params.penalize_nl = body["penalize_nl"].get(); - } - else - { - llama.params.penalize_nl = default_params.penalize_nl; - } - if (!body["n_keep"].is_null()) - { - llama.params.n_keep = body["n_keep"].get(); - } - else - { - llama.params.n_keep = default_params.n_keep; - } - if (!body["seed"].is_null()) - { + if (!body["stream"].is_null()) { + llama.stream = body["stream"].get(); + } else { + llama.stream = false; + } + if (!body["n_predict"].is_null()) { + llama.params.n_predict = body["n_predict"].get(); + } else { + llama.params.n_predict = default_params.n_predict; + } + if (!body["top_k"].is_null()) { + llama.params.top_k = body["top_k"].get(); + } else { + llama.params.top_k = default_params.top_k; + } + if (!body["top_p"].is_null()) { + llama.params.top_p = body["top_p"].get(); + } else { + llama.params.top_p = default_params.top_p; + } + if (!body["tfs_z"].is_null()) { + llama.params.tfs_z = body["tfs_z"].get(); + } else { + llama.params.tfs_z = default_params.tfs_z; + } + if (!body["typical_p"].is_null()) { + llama.params.typical_p = body["typical_p"].get(); + } else { + llama.params.typical_p = default_params.typical_p; + } + if (!body["repeat_last_n"].is_null()) { + llama.params.repeat_last_n = body["repeat_last_n"].get(); + } else { + llama.params.repeat_last_n = default_params.repeat_last_n; + } + if (!body["temperature"].is_null()) { + llama.params.temp = body["temperature"].get(); + } else { + llama.params.temp = default_params.temp; + } + if (!body["repeat_penalty"].is_null()) { + llama.params.repeat_penalty = body["repeat_penalty"].get(); + } else { + llama.params.repeat_penalty = default_params.repeat_penalty; + } + if (!body["presence_penalty"].is_null()) { + llama.params.presence_penalty = body["presence_penalty"].get(); + } else { + llama.params.presence_penalty = default_params.presence_penalty; + } + if (!body["frequency_penalty"].is_null()) { + llama.params.frequency_penalty = body["frequency_penalty"].get(); + } else { + llama.params.frequency_penalty = default_params.frequency_penalty; + } + if (!body["mirostat"].is_null()) { + llama.params.mirostat = body["mirostat"].get(); + } else { + llama.params.mirostat = default_params.mirostat; + } + if (!body["mirostat_tau"].is_null()) { + llama.params.mirostat_tau = body["mirostat_tau"].get(); + } else { + llama.params.mirostat_tau = default_params.mirostat_tau; + } + if (!body["mirostat_eta"].is_null()) { + llama.params.mirostat_eta = body["mirostat_eta"].get(); + } else { + llama.params.mirostat_eta = default_params.mirostat_eta; + } + if (!body["penalize_nl"].is_null()) { + llama.params.penalize_nl = body["penalize_nl"].get(); + } else { + llama.params.penalize_nl = false; + } + if (!body["n_keep"].is_null()) { + llama.params.n_keep = body["n_keep"].get(); + } else { + llama.params.n_keep = default_params.n_keep; + } + if (!body["seed"].is_null()) { llama.params.seed = body["seed"].get(); - } - else - { + } else { llama.params.seed = time(NULL); } - if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) - { - llama.params.logit_bias[llama_token_eos()] = -INFINITY; + if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) { + llama.params.logit_bias[llama_token_eos()] = -INFINITY; + } else { + llama.params.logit_bias.erase(llama_token_eos()); } - else - { - llama.params.logit_bias.erase(llama_token_eos()); - } - if (!body["prompt"].is_null()) - { + if 
(!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); - } - else - { - json data = { - {"status", "error"}, - {"reason", "You need to pass the prompt"}}; + } else { + json data = {{"status", "error"}, {"reason", "You need to pass the prompt"}}; res.set_content(data.dump(llama.json_indent), "application/json"); res.status = 400; return false; } - if (!body["stop"].is_null()) - { + if (!body["stop"].is_null()) { llama.params.antiprompt = body["stop"].get>(); - } - else - { - llama.params.antiprompt.clear(); + } else { + llama.params.antiprompt.clear(); } if (llama.verbose) { - std::string tmp_stop = - std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(), - std::string{}, [](std::string a, std::string b) { - return a + (a != "" ? ", \"" : "\"") + b + "\""; - }); - - fprintf(stderr, - "-------------------------\n" - "/completion parameters: {\n" - " stream: %d,\n" - " frequency_penalty: %f,\n" - " mirostat: %d,\n" - " mirostat_eta: %f,\n" - " mirostat_tau: %f,\n" - " n_keep: %d,\n" - " n_predict: %d,\n" - " penalize_nl: %d,\n" - " presence_penalty: %f,\n" - " repeat_last_n: %d,\n" - " repeat_penalty: %f,\n" - " seed: %d,\n" - " stop: [%s],\n" - " temperature: %f,\n" - " tfs_z: %f,\n" - " top_k: %d,\n" - " top_p: %f,\n" - " typical_p: %f,\n" - "}\nPROMPT[%s]\n", - llama.stream, llama.params.frequency_penalty, llama.params.mirostat, - llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep, - llama.params.n_predict, llama.params.penalize_nl, - llama.params.presence_penalty, llama.params.repeat_last_n, - llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(), - llama.params.temp, llama.params.tfs_z, llama.params.top_k, - llama.params.top_p, llama.params.typical_p, llama.params.prompt.c_str()); + std::string tmp_stop = + std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(), + std::string{}, [](std::string a, std::string b) { + return a + (a != "" ? 
", \"" : "\"") + b + "\""; + }); + + fprintf(stderr, + "-------------------------\n" + "/completion parameters: {\n" + " stream: %d,\n" + " ignore_eos: %d,\n" + " frequency_penalty: %f,\n" + " mirostat: %d,\n" + " mirostat_eta: %f,\n" + " mirostat_tau: %f,\n" + " n_keep: %d,\n" + " n_predict: %d,\n" + " penalize_nl: %d,\n" + " presence_penalty: %f,\n" + " repeat_last_n: %d,\n" + " repeat_penalty: %f,\n" + " seed: %d,\n" + " stop: [%s],\n" + " temperature: %f,\n" + " tfs_z: %f,\n" + " top_k: %d,\n" + " top_p: %f,\n" + " typical_p: %f,\n" + "}\nPROMPT[%s]\n", + llama.stream, -INFINITY == llama.params.logit_bias[llama_token_eos()], + llama.params.frequency_penalty, llama.params.mirostat, + llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep, + llama.params.n_predict, llama.params.penalize_nl, + llama.params.presence_penalty, llama.params.repeat_last_n, + llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(), + llama.params.temp, llama.params.tfs_z, llama.params.top_k, llama.params.top_p, + llama.params.typical_p, llama.params.prompt.c_str()); } return true; From 3edaf6bd8bdc853f7f0a10f9e397bd01d0e99238 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 12:55:19 -0300 Subject: [PATCH 048/121] print timings by default --- examples/server/server.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index eb75ab1de23b7..d6fb84cd978e1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -796,9 +796,7 @@ int main(int argc, char **argv) {"prompt", llama.params.prompt}, {"stopping_word", llama.stopping_word}}; - if (llama.verbose) { - llama_print_timings(llama.ctx); - } + llama_print_timings(llama.ctx); return res.set_content( data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), @@ -870,16 +868,13 @@ int main(int argc, char **argv) if (!sink.write(str.data(), str.size())) { if (llama.verbose) { fprintf(stderr, "stream closed\n"); - llama_print_timings(llama.ctx); } + llama_print_timings(llama.ctx); return false; } } - if (llama.verbose) { - llama_print_timings(llama.ctx); - } - + llama_print_timings(llama.ctx); sink.done(); return true; }; From 7332b41f9f364eabfaed4ce6f39a8a2e7af2491f Mon Sep 17 00:00:00 2001 From: digiwombat Date: Wed, 31 May 2023 15:56:27 -0400 Subject: [PATCH 049/121] Simple single-line server log for requests --- examples/server/server.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d6fb84cd978e1..7d7d508f0e2cb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -750,8 +750,11 @@ int main(int argc, char **argv) Server svr; - svr.Get("/", [](const Request &, Response &res) - { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); + svr.Get("/", [](const Request &req, Response &res) + { + fprintf(stderr, "request: GET / [remote_addr: %s]", req.remote_addr.c_str()); + res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); + }); svr.Post("/completion", [&llama](const Request &req, Response &res) { if (llama.params.embedding) { @@ -772,6 +775,8 @@ int main(int argc, char **argv) return; } + fprintf(stderr, "request: POST /completion [remote_addr: %s, stream: %s]", req.remote_addr.c_str(), llama.stream ? "true" : "false"); + if (!llama.loadPrompt()) { json data = {{"status", "error"}, {"reason", "Context too long."}}; res.set_content( @@ -885,6 +890,7 @@ int main(int argc, char **argv) svr.Post("/tokenize", [&llama](const Request &req, Response &res) { + fprintf(stderr, "request: POST /tokenize [remote_addr: %s]", req.remote_addr.c_str()); json body = json::parse(req.body); json data = { {"tokens", ::llama_tokenize(llama.ctx, body["content"].get(), false) } }; @@ -893,6 +899,7 @@ int main(int argc, char **argv) svr.Post("/embedding", [&llama](const Request &req, Response &res) { + fprintf(stderr, "request: POST /embedding [remote_addr: %s]", req.remote_addr.c_str()); if(!llama.params.embedding) { std::vector empty; json data = { From dda4c10d64963b054a9f69dd6c7bac3a1bb32c23 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Wed, 31 May 2023 16:23:39 -0400 Subject: [PATCH 050/121] Switch to the CPPHTTPLIB logger. Verbose adds body dump as well as request info. --- examples/server/server.cpp | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7d7d508f0e2cb..922ea942148ad 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -750,11 +750,8 @@ int main(int argc, char **argv) Server svr; - svr.Get("/", [](const Request &req, Response &res) - { - fprintf(stderr, "request: GET / [remote_addr: %s]", req.remote_addr.c_str()); - res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); - }); + svr.Get("/", [](const Request &, Response &res) + { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); svr.Post("/completion", [&llama](const Request &req, Response &res) { if (llama.params.embedding) { @@ -775,8 +772,6 @@ int main(int argc, char **argv) return; } - fprintf(stderr, "request: POST /completion [remote_addr: %s, stream: %s]", req.remote_addr.c_str(), llama.stream ? "true" : "false"); - if (!llama.loadPrompt()) { json data = {{"status", "error"}, {"reason", "Context too long."}}; res.set_content( @@ -890,7 +885,6 @@ int main(int argc, char **argv) svr.Post("/tokenize", [&llama](const Request &req, Response &res) { - fprintf(stderr, "request: POST /tokenize [remote_addr: %s]", req.remote_addr.c_str()); json body = json::parse(req.body); json data = { {"tokens", ::llama_tokenize(llama.ctx, body["content"].get(), false) } }; @@ -899,7 +893,6 @@ int main(int argc, char **argv) svr.Post("/embedding", [&llama](const Request &req, Response &res) { - fprintf(stderr, "request: POST /embedding [remote_addr: %s]", req.remote_addr.c_str()); if(!llama.params.embedding) { std::vector empty; json data = { @@ -920,8 +913,22 @@ int main(int argc, char **argv) } if (llama.verbose) { - svr.set_logger([](const Request &req, const Response &res) { - fprintf(stderr, "%s", log(req, res).c_str()); + svr.set_logger([](const Request& req, const Response& res) { + json log = { + { "status", res.status }, + { "request", req.body }, + { "response", res.body }, + }; + fprintf(stdout, "http_request: request: %s %s \nhttp_request: log: %s\n", req.method.c_str(), req.path.c_str(), log.dump().c_str()); + }); + } else { + svr.set_logger([](const Request& req, const Response& res) { + json log = { + { "status", res.status }, + { "request", req.body }, + { "response", res.body }, + }; + fprintf(stdout, "http_request: request: %s %s \n", req.method.c_str(), req.path.c_str()); }); } From 86337e3a9bb97446342b93c0f55ad6c650bce3bd Mon Sep 17 00:00:00 2001 From: digiwombat Date: Wed, 31 May 2023 16:41:34 -0400 Subject: [PATCH 051/121] Server console logs now come in one flavor: Verbose. --- examples/server/server.cpp | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 922ea942148ad..8436db10ee887 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -912,25 +912,14 @@ int main(int argc, char **argv) fprintf(stderr, "NOTE: Mode embedding enabled. 
Completion function doesn't work in this mode.\n"); } - if (llama.verbose) { - svr.set_logger([](const Request& req, const Response& res) { - json log = { - { "status", res.status }, - { "request", req.body }, - { "response", res.body }, - }; - fprintf(stdout, "http_request: request: %s %s \nhttp_request: log: %s\n", req.method.c_str(), req.path.c_str(), log.dump().c_str()); - }); - } else { - svr.set_logger([](const Request& req, const Response& res) { - json log = { - { "status", res.status }, - { "request", req.body }, - { "response", res.body }, - }; - fprintf(stdout, "http_request: request: %s %s \n", req.method.c_str(), req.path.c_str()); + svr.set_logger([](const Request& req, const Response& res) { + json log = { + { "status", res.status }, + { "request", req.body }, + { "response", res.body }, + }; + fprintf(stdout, "http_request: request: %s %s \nhttp_request: log: %s\n", req.method.c_str(), req.path.c_str(), log.dump().c_str()); }); - } svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { auto fmt = "500 Internal Server Error\n%s"; From 1b96df2b5fdb8d866c245c572ca57959b94db043 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Wed, 31 May 2023 16:42:43 -0400 Subject: [PATCH 052/121] Spacing fix. Nothing to see here. --- examples/server/server.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8436db10ee887..3766ac377e9a4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -914,12 +914,12 @@ int main(int argc, char **argv) svr.set_logger([](const Request& req, const Response& res) { json log = { - { "status", res.status }, - { "request", req.body }, - { "response", res.body }, + { "status", res.status }, + { "request", req.body }, + { "response", res.body }, }; fprintf(stdout, "http_request: request: %s %s \nhttp_request: log: %s\n", req.method.c_str(), req.path.c_str(), log.dump().c_str()); - }); + }); svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { auto fmt = "500 Internal Server Error\n%s"; From 276fa99873273e235fa521796c1338e87491e04b Mon Sep 17 00:00:00 2001 From: digiwombat Date: Wed, 31 May 2023 16:45:57 -0400 Subject: [PATCH 053/121] Misunderstood the instructions, I think. Back to the raw JSON output only. 
--- examples/server/server.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3766ac377e9a4..fda7f7b3ebdbf 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -915,10 +915,11 @@ int main(int argc, char **argv) svr.set_logger([](const Request& req, const Response& res) { json log = { { "status", res.status }, + { "path", req.path }, { "request", req.body }, { "response", res.body }, }; - fprintf(stdout, "http_request: request: %s %s \nhttp_request: log: %s\n", req.method.c_str(), req.path.c_str(), log.dump().c_str()); + fprintf(stdout, "http_request: %s\n", log.dump().c_str()); }); svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { From 43d295fddc50e0838bc9a1866d9f187327ef5689 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 16:54:12 -0300 Subject: [PATCH 054/121] filter empty stopping strings --- examples/server/server.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fda7f7b3ebdbf..4d4f3fc48b52e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -608,10 +608,12 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & res.status = 400; return false; } + llama.params.antiprompt.clear(); if (!body["stop"].is_null()) { - llama.params.antiprompt = body["stop"].get>(); - } else { - llama.params.antiprompt.clear(); + const auto stop = body["stop"].get>(); + std::copy_if(stop.begin(), stop.end(), + std::back_inserter(llama.params.antiprompt), + [](const std::string &str) { return !str.empty(); }); } if (llama.verbose) { From 1bd7cc60a8c63a689e62a2600c5678cf5192bd3e Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 17:58:43 -0300 Subject: [PATCH 055/121] reuse format_generation_settings for logging --- examples/server/server.cpp | 84 ++++++++++++++------------------------ 1 file changed, 30 insertions(+), 54 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4d4f3fc48b52e..ef3751ea70ad8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -507,6 +507,31 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para return true; } +json format_generation_settings(llama_server_context &llama) { + const bool ignore_eos = -INFINITY == llama.params.logit_bias[llama_token_eos()]; + return json { + { "seed", llama.params.seed }, + { "temp", llama.params.temp }, + { "top_k", llama.params.top_k }, + { "top_p", llama.params.top_p }, + { "tfs_z", llama.params.tfs_z }, + { "typical_p", llama.params.typical_p }, + { "repeat_last_n", llama.params.repeat_last_n }, + { "repeat_penalty", llama.params.repeat_penalty }, + { "presence_penalty", llama.params.presence_penalty }, + { "frequency_penalty", llama.params.frequency_penalty }, + { "mirostat", llama.params.mirostat }, + { "mirostat_tau", llama.params.mirostat_tau }, + { "mirostat_eta", llama.params.mirostat_eta }, + { "penalize_nl", llama.params.penalize_nl }, + { "stop", llama.params.antiprompt }, + { "n_predict", llama.params.n_predict }, + { "n_keep", llama.params.n_keep }, + { "ignore_eos", ignore_eos }, + { "stream", llama.stream }, + }; +} + bool parse_options_completion(json body, llama_server_context& llama, Response &res) { gpt_params default_params; @@ -617,67 +642,18 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } if (llama.verbose) { 
- std::string tmp_stop = - std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(), - std::string{}, [](std::string a, std::string b) { - return a + (a != "" ? ", \"" : "\"") + b + "\""; - }); - + json tmp = format_generation_settings(llama); fprintf(stderr, "-------------------------\n" - "/completion parameters: {\n" - " stream: %d,\n" - " ignore_eos: %d,\n" - " frequency_penalty: %f,\n" - " mirostat: %d,\n" - " mirostat_eta: %f,\n" - " mirostat_tau: %f,\n" - " n_keep: %d,\n" - " n_predict: %d,\n" - " penalize_nl: %d,\n" - " presence_penalty: %f,\n" - " repeat_last_n: %d,\n" - " repeat_penalty: %f,\n" - " seed: %d,\n" - " stop: [%s],\n" - " temperature: %f,\n" - " tfs_z: %f,\n" - " top_k: %d,\n" - " top_p: %f,\n" - " typical_p: %f,\n" - "}\nPROMPT[%s]\n", - llama.stream, -INFINITY == llama.params.logit_bias[llama_token_eos()], - llama.params.frequency_penalty, llama.params.mirostat, - llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep, - llama.params.n_predict, llama.params.penalize_nl, - llama.params.presence_penalty, llama.params.repeat_last_n, - llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(), - llama.params.temp, llama.params.tfs_z, llama.params.top_k, llama.params.top_p, - llama.params.typical_p, llama.params.prompt.c_str()); + "/completion parameters: %s\n" + "PROMPT[%s]\n", + tmp.dump(4, ' ', false, json::error_handler_t::replace).c_str(), + llama.params.prompt.c_str()); } return true; } -json format_generation_settings(const llama_server_context& llama) { - return json { - { "seed", llama.params.seed }, - { "temp", llama.params.temp }, - { "top_k", llama.params.top_k }, - { "top_p", llama.params.top_p }, - { "tfs_z", llama.params.tfs_z }, - { "typical_p", llama.params.typical_p }, - { "repeat_last_n", llama.params.repeat_last_n }, - { "repeat_penalty", llama.params.repeat_penalty }, - { "presence_penalty", llama.params.presence_penalty }, - { "frequency_penalty", llama.params.frequency_penalty }, - { "mirostat", llama.params.mirostat }, - { "mirostat_tau", llama.params.mirostat_tau }, - { "mirostat_eta", llama.params.mirostat_eta }, - { "penalize_nl", llama.params.penalize_nl } - }; -} - std::string log(const Request &req, const Response &res) { std::string s; From 497160a60d4c7f3ddfebc5453042fb2ef9c6c682 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 18:01:07 -0300 Subject: [PATCH 056/121] remove old log function --- examples/server/server.cpp | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ef3751ea70ad8..beb335dfa8aaa 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -654,44 +654,6 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & return true; } -std::string log(const Request &req, const Response &res) -{ - std::string s; - - s += "============ REQUEST ===========\n"; - s += "< "; - s += req.method; - s += " "; - s += req.path; - s += " "; - s += req.version; - s += "\n"; - - if (!req.body.empty()) { - std::string line; - std::istringstream stream(req.body); - while (std::getline(stream, line)) { - s += "< " + line + "\n"; - } - } - - s += "------------ RESPONSE ------------\n> "; - s += res.version; - s += " "; - s += std::to_string(res.status); - s += "\n"; - - if (!res.body.empty()) { - std::string line; - std::istringstream stream(res.body); - while (std::getline(stream, line)) { - s += "> " + line + "\n"; - } - } - - return s; -} - int 
main(int argc, char **argv) { llama_init_backend(); From 9104fe5a7cf26a811a360ef0000c7ae195748819 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 31 May 2023 11:47:55 +0300 Subject: [PATCH 057/121] Change how the token buffers work. There is now just embd (and last_n_tokens). The input can also be of any length in which case it will be truncated like it normally would. --- examples/server/server.cpp | 264 +++++++++++++++++-------------------- 1 file changed, 124 insertions(+), 140 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index beb335dfa8aaa..71d93cb338d9d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -14,6 +14,12 @@ struct server_params bool verbose = false; }; +static size_t common_part(const std::vector & a, const std::vector & b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++); + return i; +} + struct llama_server_context { bool stream = false; @@ -28,10 +34,7 @@ struct llama_server_context std::vector embd; std::vector last_n_tokens; - std::vector processed_tokens; - std::vector embd_inp; - std::vector last_prompt_tokens; llama_context *ctx = nullptr; gpt_params params; @@ -55,11 +58,10 @@ struct llama_server_context generated_text.reserve(params.n_ctx); stopping_word = ""; - //processed_tokens.clear(); - embd_inp.clear(); n_remain = 0; n_past = 0; n_consumed = 0; + last_n_tokens.clear(); } bool loadModel(const gpt_params ¶ms_) @@ -80,177 +82,159 @@ struct llama_server_context bool loadPrompt() { params.prompt.insert(0, 1, ' '); // always add a first space std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); - if (prompt_tokens == last_prompt_tokens) - { - embd.clear(); + + if (params.n_keep < 0) { + params.n_keep = (int)prompt_tokens.size(); } - // compare the evaluated prompt with the new prompt - for (n_past = 0; n_past < prompt_tokens.size() - 1 && n_past < processed_tokens.size(); n_past++) { - if (prompt_tokens[n_past] != processed_tokens[n_past]) { - break; - } + params.n_keep = std::min(params.n_ctx - 4, params.n_keep); + + // if input prompt is too big, truncate like normal + if (prompt_tokens.size() >= (size_t)params.n_ctx) { + const int n_left = (params.n_ctx - params.n_keep)/2; + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); + new_tokens.insert(new_tokens.end(), prompt_tokens.end() - n_left, prompt_tokens.end()); + prompt_tokens = new_tokens; } - processed_tokens.resize(n_past); - if (prompt_tokens.size() > n_past) { - embd_inp.insert(embd_inp.end(), prompt_tokens.begin() + n_past, prompt_tokens.end()); + + // compare the evaluated prompt with the new prompt + n_past = common_part(embd, prompt_tokens); + embd = prompt_tokens; + if (n_past == prompt_tokens.size()) { + // we have to evaluate at least 1 token to generate logits. 
+ n_past--; } - last_prompt_tokens = prompt_tokens; has_next_token = true; return true; } void beginCompletion() { - if(n_remain == 0) { - // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) - { - params.n_keep = (int)embd_inp.size(); - } - } + // number of tokens to keep when resetting context + + n_remain = params.n_predict; llama_set_rng_seed(ctx, params.seed); } llama_token nextToken() { llama_token result = -1; - if (embd.size() > 0) + + if (embd.size() >= (size_t)params.n_ctx) { + // Reset context + const int n_left = (params.n_ctx - params.n_keep)/2; + + std::vector new_tokens(embd.begin(), embd.begin() + params.n_keep); + new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); + embd = new_tokens; + n_past = params.n_keep; + } + + while (n_past < embd.size()) { - if (n_past + embd.size() > (size_t)params.n_ctx) + int n_eval = (int)embd.size() - n_past; + if (n_eval > params.n_batch) { - // Reset context - const int n_left = n_past - params.n_keep; - n_past = std::max(1, params.n_keep); - //processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end()); - embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size()); + n_eval = params.n_batch; } - for (int i = 0; i < (int)embd.size(); i += params.n_batch) + if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) { - int n_eval = (int)embd.size() - i; - if (n_eval > params.n_batch) - { - n_eval = params.n_batch; - } - if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) - { - fprintf(stderr, "%s : failed to eval\n", __func__); - has_next_token = false; - return result; - } - n_past += n_eval; + fprintf(stderr, "%s : failed to eval\n", __func__); + has_next_token = false; + return result; } + n_past += n_eval; } - embd.clear(); - if (embd_inp.size() <= n_consumed) + + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + llama_token id = 0; { - // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
params.n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - llama_token id = 0; + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); + logits[it->first] += it->second; + } - // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) - { - logits[it->first] += it->second; - } + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) + { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) - { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } + llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; + + // Apply penalties + float nl_logit = logits[llama_token_nl()]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); + llama_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) + { + logits[llama_token_nl()] = nl_logit; + } - llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; - - // Apply penalties - float nl_logit = logits[llama_token_nl()]; - auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); - llama_sample_repetition_penalty(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) + if (temp <= 0) + { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } + else + { + if (mirostat == 1) { - logits[llama_token_nl()] = nl_logit; + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); } - - if (temp <= 0) + else if (mirostat == 2) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else { - if (mirostat == 1) - { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, 
mirostat_eta, mirostat_m, &mirostat_mu); - } - else if (mirostat == 2) - { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } - else - { - // Temperature sampling - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - processed_tokens.push_back(id); - num_tokens_predicted++; - } - - // add it to the context - embd.push_back(id); - result = id; - // decrement remaining sampling budget - --n_remain; - } - else - { - // some user input remains from prompt or interaction, forward it to processing - while (embd_inp.size() > n_consumed) - { - embd.push_back(embd_inp[n_consumed]); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[n_consumed]); - processed_tokens.push_back(embd_inp[n_consumed]); - ++n_consumed; - if ((int)embd.size() >= params.n_batch) - { - break; + // Temperature sampling + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); } } + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + num_tokens_predicted++; } + // add it to the context + embd.push_back(id); + result = id; + // decrement remaining sampling budget + --n_remain; + if (!embd.empty() && embd.back() == llama_token_eos()) { stopping_word = llama_token_to_str(ctx, embd.back()); has_next_token = false; From bed308c69cebe0084a88b002a48e7326ac3706e8 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 1 Jun 2023 01:15:48 +0300 Subject: [PATCH 058/121] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- examples/server/server.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 71d93cb338d9d..ce9e3e6d7393a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2,8 +2,8 @@ #include "llama.h" #include "build-info.h" -#include -#include +#include "httplib.h" +#include "json.hpp" struct server_params { @@ -161,7 +161,7 @@ struct llama_server_context const bool penalize_nl = params.penalize_nl; llama_token id = 0; { - auto logits = llama_get_logits(ctx); + auto *logits = llama_get_logits(ctx); auto n_vocab = llama_n_vocab(ctx); // Apply params.logit_bias map @@ -692,7 +692,7 @@ int main(int argc, char **argv) llama.rewind(); llama_reset_timings(llama.ctx); - if (parse_options_completion(json::parse(req.body), llama, res) == false) { + if (!parse_options_completion(json::parse(req.body), llama, res)) { return; } @@ -847,10 +847,10 @@ int main(int argc, char **argv) }); svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { - auto fmt = "500 Internal Server Error\n%s"; + const auto *fmt = "500 Internal Server Error\n%s"; char buf[BUFSIZ]; try { - std::rethrow_exception(ep); + 
std::rethrow_exception(std::move(ep)); } catch (std::exception &e) { snprintf(buf, sizeof(buf), fmt, e.what()); } catch (...) { From 342604bb81c67ae390bed0193d2a60b8df4f9f84 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Wed, 31 May 2023 19:54:05 -0400 Subject: [PATCH 059/121] Added a super simple CORS header as default for all endpoints. --- examples/server/server.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ce9e3e6d7393a..ecec71db890d6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -674,6 +674,8 @@ int main(int argc, char **argv) Server svr; + svr.set_default_headers({ {"Access-Control-Allow-Origin", "*"} }); + svr.Get("/", [](const Request &, Response &res) { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); From e9b1f0bf5c6b810a8f77c1173fb4b6bb6b6f72e8 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 20:31:58 -0300 Subject: [PATCH 060/121] fix stopping strings --- examples/server/server.cpp | 94 +++++++++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 16 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ecec71db890d6..c12a84fa7af4b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -20,6 +20,33 @@ static size_t common_part(const std::vector & a, const std::vector< return i; } +enum stop_type { + STOP_FULL, + STOP_PARTIAL, +}; + +bool ends_with(const std::string &str, const std::string &suffix) +{ + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +} + +size_t find_partial_stop_string(const std::string &stop, const std::string &text) +{ + if (!text.empty()) { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { + if (stop[char_index] == text_last_char) { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) { + return text.size() - char_index - 1; + } + } + } + } + return std::string::npos; +} + struct llama_server_context { bool stream = false; @@ -248,6 +275,31 @@ struct llama_server_context return result; } + size_t findStoppingStrings(const std::string &text, const size_t last_token_size, + const stop_type type) + { + size_t stop_pos = std::string::npos; + for (const std::string &word : params.antiprompt) { + size_t pos; + if (type == STOP_FULL) { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); + } else { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) { + if (type == STOP_FULL) { + stopping_word = word; + has_next_token = false; + } + stop_pos = pos; + } + } + return stop_pos; + } + std::string doCompletion() { llama_token token = nextToken(); @@ -272,16 +324,6 @@ struct llama_server_context stopping_word.c_str()); } - for (const std::string& word : params.antiprompt) { - size_t i = generated_text.find(word, generated_text.size() - (word.size() + token_text.size())); - if (i != std::string::npos) { - generated_text.erase(generated_text.begin() + i, generated_text.end()); - stopping_word = word; - has_next_token = false; - break; - } - } - return token_text; } @@ -711,7 +753,14 @@ int main(int argc, char **argv) if (!llama.stream) { while (llama.has_next_token) { - llama.doCompletion(); + const std::string token_text = llama.doCompletion(); + const size_t stop_pos = llama.findStoppingStrings( + llama.generated_text, token_text.size(), STOP_FULL); + + if (stop_pos != std::string::npos) { + llama.generated_text.erase(llama.generated_text.begin() + stop_pos, + llama.generated_text.end()); + } } json data = {{"content", llama.generated_text}, @@ -724,7 +773,7 @@ int main(int argc, char **argv) llama_print_timings(llama.ctx); - return res.set_content( + res.set_content( data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), "application/json"); } else { @@ -733,7 +782,7 @@ int main(int argc, char **argv) int32_t multibyte_pending = 0; while (llama.has_next_token) { - std::string token_text = llama.doCompletion(); + const std::string token_text = llama.doCompletion(); if (multibyte_pending > 0) 
{ multibyte_pending -= token_text.size(); @@ -761,8 +810,22 @@ int main(int argc, char **argv) continue; } - const size_t pos = std::min(sent_count, llama.generated_text.size()); - std::string to_send = llama.generated_text.substr(pos); + size_t pos = std::min(sent_count, llama.generated_text.size()); + + const char *str_test = llama.generated_text.c_str() + pos; + size_t stop_pos = + llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); + if (stop_pos != std::string::npos) { + llama.generated_text.erase( + llama.generated_text.begin() + pos + stop_pos, + llama.generated_text.end()); + pos = std::min(sent_count, llama.generated_text.size()); + } else { + stop_pos = llama.findStoppingStrings(str_test, token_text.size(), + STOP_PARTIAL); + } + + std::string to_send = llama.generated_text.substr(pos, stop_pos); sent_count += to_send.size(); json data; @@ -808,7 +871,6 @@ int main(int argc, char **argv) } }); - svr.Post("/tokenize", [&llama](const Request &req, Response &res) { json body = json::parse(req.body); From f7882e2d6943e3ddadcbd287d8f181c10276afdb Mon Sep 17 00:00:00 2001 From: digiwombat Date: Wed, 31 May 2023 20:35:28 -0400 Subject: [PATCH 061/121] Fixed a crash caused by erasing from empty last_n_tokens --- examples/server/server.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c12a84fa7af4b..3e27a7bbbb064 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -251,7 +251,10 @@ struct llama_server_context id = llama_sample_token(ctx, &candidates_p); } } - last_n_tokens.erase(last_n_tokens.begin()); + if (!last_n_tokens.empty()) + { + last_n_tokens.erase(last_n_tokens.begin()); + } last_n_tokens.push_back(id); num_tokens_predicted++; } From 5bbc030338561641534d65968977a0016662e96d Mon Sep 17 00:00:00 2001 From: Felix Hellmann Date: Thu, 1 Jun 2023 10:47:53 +0200 Subject: [PATCH 062/121] Add Options enpoints and Access-Control-Allow-Headers to satisfy CORS rules --- examples/server/server.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3e27a7bbbb064..b011ff7cd5ed0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -719,7 +719,10 @@ int main(int argc, char **argv) Server svr; - svr.set_default_headers({ {"Access-Control-Allow-Origin", "*"} }); + svr.set_default_headers({ + {"Access-Control-Allow-Origin", "*"}, + {"Access-Control-Allow-Headers", "content-type"} + }); svr.Get("/", [](const Request &, Response &res) { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); @@ -874,6 +877,11 @@ int main(int argc, char **argv) } }); + svr.Options(R"(/.*)", [&llama](const Request &req, Response &res) + { + return res.set_content("", "application/json"); + }); + svr.Post("/tokenize", [&llama](const Request &req, Response &res) { json body = json::parse(req.body); From 8c6a5fc92bac786f0bb0737a2c98d96096a28ea1 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 1 Jun 2023 13:18:12 +0300 Subject: [PATCH 063/121] last tokens fixes --- examples/server/server.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3e27a7bbbb064..fc24f9c13b45a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -88,7 +88,6 @@ struct llama_server_context n_remain = 0; n_past = 0; n_consumed = 0; - last_n_tokens.clear(); } bool loadModel(const gpt_params ¶ms_) @@ -120,7 +119,12 @@ struct llama_server_context const int n_left = (params.n_ctx - params.n_keep)/2; std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); new_tokens.insert(new_tokens.end(), prompt_tokens.end() - n_left, prompt_tokens.end()); + std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); prompt_tokens = new_tokens; + } else { + size_t ps = prompt_tokens.size(); + std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); + std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); } // compare the evaluated prompt with the new prompt @@ -251,10 +255,7 @@ struct llama_server_context id = llama_sample_token(ctx, &candidates_p); } } - if (!last_n_tokens.empty()) - { - last_n_tokens.erase(last_n_tokens.begin()); - } + last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); num_tokens_predicted++; } From 9531ae60dbd21c19260258cfd19e71fff18bcf7a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 1 Jun 2023 13:57:47 +0300 Subject: [PATCH 064/121] Add logit bias support --- examples/server/server.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fc24f9c13b45a..04a6af47ac164 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -655,6 +655,16 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } else { llama.params.logit_bias.erase(llama_token_eos()); } + if (body["logit_bias"].is_array()) { + int n_vocab = llama_n_vocab(llama.ctx); + for (const auto &el : body["logit_bias"]) { + if (el.is_array() && el.size() == 2 && el[0].is_number_integer() && el[1].is_number_float()) { + llama_token tok = el[0].get(); + if (tok < 0 || tok >= n_vocab) continue; + llama.params.logit_bias[tok] = el[1].get(); + } + } + } if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); } else { From 49a18bdd147d8f470c83ea7bf0eb8faa069dc7a7 Mon Sep 17 00:00:00 2001 From: anon Date: Thu, 1 Jun 2023 09:41:35 -0300 Subject: [PATCH 065/121] remove unused parameter warning --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d0d0c0b79861b..8f0b702d57206 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -888,7 +888,7 @@ int main(int argc, char **argv) } }); - svr.Options(R"(/.*)", [&llama](const Request &req, Response &res) + svr.Options(R"(/.*)", [&llama](const Request &, Response &res) { return res.set_content("", 
"application/json"); }); From 6025476e3910079c31e4a4f0c7194fdf2913c8a5 Mon Sep 17 00:00:00 2001 From: anon Date: Thu, 1 Jun 2023 09:49:16 -0300 Subject: [PATCH 066/121] default penalize_nl back to true --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8f0b702d57206..b16a49b10d10a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -638,7 +638,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & if (!body["penalize_nl"].is_null()) { llama.params.penalize_nl = body["penalize_nl"].get(); } else { - llama.params.penalize_nl = false; + llama.params.penalize_nl = default_params.penalize_nl; } if (!body["n_keep"].is_null()) { llama.params.n_keep = body["n_keep"].get(); From 8cbc4be6c24fbcab20bb3b0b050b1c5f6bfdb97f Mon Sep 17 00:00:00 2001 From: anon Date: Thu, 1 Jun 2023 09:49:50 -0300 Subject: [PATCH 067/121] clear logit_bias between requests + print --- examples/server/server.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b16a49b10d10a..b345b53c26047 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -559,6 +559,7 @@ json format_generation_settings(llama_server_context &llama) { { "n_keep", llama.params.n_keep }, { "ignore_eos", ignore_eos }, { "stream", llama.stream }, + { "logit_bias", llama.params.logit_bias }, }; } @@ -650,10 +651,10 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } else { llama.params.seed = time(NULL); } + + llama.params.logit_bias.clear(); if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) { llama.params.logit_bias[llama_token_eos()] = -INFINITY; - } else { - llama.params.logit_bias.erase(llama_token_eos()); } if (body["logit_bias"].is_array()) { int n_vocab = llama_n_vocab(llama.ctx); @@ -665,6 +666,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & } } } + if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); } else { @@ -673,6 +675,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & res.status = 400; return false; } + llama.params.antiprompt.clear(); if (!body["stop"].is_null()) { const auto stop = body["stop"].get>(); From 0bc047730f0b0c70c0099fbb0d422cbf286ec335 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 2 Jun 2023 10:29:09 +0300 Subject: [PATCH 068/121] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b345b53c26047..afa52a28651ef 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -891,7 +891,7 @@ int main(int argc, char **argv) } }); - svr.Options(R"(/.*)", [&llama](const Request &, Response &res) + svr.Options(R"(/.*)", [](const Request &, Response &res) { return res.set_content("", "application/json"); }); From 731ecc0d1be4cb3119b2e371a544914edf95d591 Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 05:45:16 -0300 Subject: [PATCH 069/121] fix typo --- examples/server/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 67b0867545574..74126c687ffd3 100644 --- 
a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -4,7 +4,7 @@ add_executable(${TARGET} server.cpp json.hpp httplib.h) target_compile_definitions(${TARGET} PRIVATE # single thread CPPHTTPLIB_THREAD_POOL_COUNT=1 - # crash the server in the debug mode, otherwise send http 500 error + # crash the server in debug mode, otherwise send an http 500 error $<$: CPPHTTPLIB_NO_EXCEPTIONS=1 > From ebfead6e5a16d0c77ab3e42aeb61eb10f5c831f8 Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 05:45:57 -0300 Subject: [PATCH 070/121] remove unused variables --- examples/server/server.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index afa52a28651ef..c7b8158c6d0a2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -55,8 +55,6 @@ struct llama_server_context size_t num_tokens_predicted = 0; size_t n_past = 0; - size_t n_consumed = 0; - size_t n_session_consumed = 0; size_t n_remain = 0; std::vector embd; @@ -87,7 +85,6 @@ struct llama_server_context n_remain = 0; n_past = 0; - n_consumed = 0; } bool loadModel(const gpt_params ¶ms_) From 1488a0f528f338a50be397baea98ac739d174be3 Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 05:47:00 -0300 Subject: [PATCH 071/121] make functions that never return false void --- examples/server/server.cpp | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c7b8158c6d0a2..a3b16cad1a34b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -102,7 +102,7 @@ struct llama_server_context return true; } - bool loadPrompt() { + void loadPrompt() { params.prompt.insert(0, 1, ' '); // always add a first space std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); @@ -132,7 +132,6 @@ struct llama_server_context n_past--; } has_next_token = true; - return true; } void beginCompletion() @@ -389,7 +388,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, con fprintf(stderr, "\n"); } -bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params ¶ms) +void server_params_parse(int argc, char **argv, server_params &sparams, gpt_params ¶ms) { gpt_params default_params; server_params default_sparams; @@ -531,7 +530,6 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para server_print_usage(argc, argv, default_params, default_sparams); exit(1); } - return true; } json format_generation_settings(llama_server_context &llama) { @@ -706,10 +704,7 @@ int main(int argc, char **argv) llama_server_context llama; params.model = "ggml-model.bin"; - if (server_params_parse(argc, argv, sparams, params) == false) - { - return 1; - } + server_params_parse(argc, argv, sparams, params); llama.verbose = sparams.verbose; llama.json_indent = sparams.verbose ? 
4 : -1; @@ -757,15 +752,7 @@ int main(int argc, char **argv) return; } - if (!llama.loadPrompt()) { - json data = {{"status", "error"}, {"reason", "Context too long."}}; - res.set_content( - data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), - "application/json"); - res.status = 400; - return; - } - + llama.loadPrompt(); llama.beginCompletion(); if (!llama.stream) { From 49dce94885a72d264afc15e9619239ecda214af5 Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 05:51:34 -0300 Subject: [PATCH 072/121] make types match gpt_params exactly --- examples/server/server.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a3b16cad1a34b..fcdc38e8aa690 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -567,12 +567,12 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & llama.stream = false; } if (!body["n_predict"].is_null()) { - llama.params.n_predict = body["n_predict"].get(); + llama.params.n_predict = body["n_predict"].get(); } else { llama.params.n_predict = default_params.n_predict; } if (!body["top_k"].is_null()) { - llama.params.top_k = body["top_k"].get(); + llama.params.top_k = body["top_k"].get(); } else { llama.params.top_k = default_params.top_k; } @@ -592,7 +592,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & llama.params.typical_p = default_params.typical_p; } if (!body["repeat_last_n"].is_null()) { - llama.params.repeat_last_n = body["repeat_last_n"].get(); + llama.params.repeat_last_n = body["repeat_last_n"].get(); } else { llama.params.repeat_last_n = default_params.repeat_last_n; } @@ -617,7 +617,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & llama.params.frequency_penalty = default_params.frequency_penalty; } if (!body["mirostat"].is_null()) { - llama.params.mirostat = body["mirostat"].get(); + llama.params.mirostat = body["mirostat"].get(); } else { llama.params.mirostat = default_params.mirostat; } @@ -632,17 +632,17 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & llama.params.mirostat_eta = default_params.mirostat_eta; } if (!body["penalize_nl"].is_null()) { - llama.params.penalize_nl = body["penalize_nl"].get(); + llama.params.penalize_nl = body["penalize_nl"].get(); } else { llama.params.penalize_nl = default_params.penalize_nl; } if (!body["n_keep"].is_null()) { - llama.params.n_keep = body["n_keep"].get(); + llama.params.n_keep = body["n_keep"].get(); } else { llama.params.n_keep = default_params.n_keep; } if (!body["seed"].is_null()) { - llama.params.seed = body["seed"].get(); + llama.params.seed = body["seed"].get(); } else { llama.params.seed = time(NULL); } From a8a9f1968956ebd65a44ed460d015e4cf60c1d65 Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 05:57:20 -0300 Subject: [PATCH 073/121] small fixes --- examples/server/server.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fcdc38e8aa690..d195fb1673be7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -137,8 +137,6 @@ struct llama_server_context void beginCompletion() { // number of tokens to keep when resetting context - - n_remain = params.n_predict; llama_set_rng_seed(ctx, params.seed); } @@ -192,9 +190,8 @@ struct llama_server_context auto n_vocab = llama_n_vocab(ctx); // Apply params.logit_bias 
map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) - { - logits[it->first] += it->second; + for (const auto &it : params.logit_bias) { + logits[it.first] += it.second; } std::vector candidates; @@ -271,7 +268,7 @@ struct llama_server_context return result; } - has_next_token = params.n_predict == -1 ? true : n_remain != 0; + has_next_token = params.n_predict == -1 || n_remain != 0; return result; } @@ -330,7 +327,7 @@ struct llama_server_context std::vector embedding(std::string content, int threads) { content.insert(0, 1, ' '); std::vector tokens = ::llama_tokenize(ctx, content, true); - if (tokens.size() > 0) + if (!tokens.empty()) { if (llama_eval(ctx, tokens.data(), tokens.size(), 0, threads)) { @@ -340,7 +337,7 @@ struct llama_server_context } } const int n_embd = llama_n_embd(ctx); - const auto embeddings = llama_get_embeddings(ctx); + auto *const embeddings = llama_get_embeddings(ctx); std::vector embeddings_(embeddings, embeddings + n_embd); return embeddings_; } From 2932db15a30b26c0168dd40e826e58df64cb98be Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 06:55:38 -0300 Subject: [PATCH 074/121] avoid creating element in logit_bias accidentally --- examples/server/server.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d195fb1673be7..44f6b49055084 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -530,7 +530,9 @@ void server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } json format_generation_settings(llama_server_context &llama) { - const bool ignore_eos = -INFINITY == llama.params.logit_bias[llama_token_eos()]; + const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); + const bool ignore_eos = + eos_bias != llama.params.logit_bias.end() && -INFINITY == eos_bias->second; return json { { "seed", llama.params.seed }, { "temp", llama.params.temp }, From 47efbb5cf379399416f1cee611b7585507f948bb Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 07:19:21 -0300 Subject: [PATCH 075/121] use std::isinf to check if ignore_eos is active --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 44f6b49055084..7f287e1c782d7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -531,8 +531,8 @@ void server_params_parse(int argc, char **argv, server_params &sparams, gpt_para json format_generation_settings(llama_server_context &llama) { const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); - const bool ignore_eos = - eos_bias != llama.params.logit_bias.end() && -INFINITY == eos_bias->second; + const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && + eos_bias->second < 0.0f && std::isinf(eos_bias->second); return json { { "seed", llama.params.seed }, { "temp", llama.params.temp }, From 88cc7bb6f7f581ef00fd0c753881cb3227caabc0 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 2 Jun 2023 13:29:57 +0300 Subject: [PATCH 076/121] Stuff with logits --- examples/server/server.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index afa52a28651ef..2021130d5f3c2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -538,7 +538,10 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } json 
format_generation_settings(llama_server_context &llama) { - const bool ignore_eos = -INFINITY == llama.params.logit_bias[llama_token_eos()]; + const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); + const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && + eos_bias->second < 0.0f && std::isinf(eos_bias->second); + return json { { "seed", llama.params.seed }, { "temp", llama.params.temp }, @@ -659,10 +662,15 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & if (body["logit_bias"].is_array()) { int n_vocab = llama_n_vocab(llama.ctx); for (const auto &el : body["logit_bias"]) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer() && el[1].is_number_float()) { + if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { llama_token tok = el[0].get(); - if (tok < 0 || tok >= n_vocab) continue; - llama.params.logit_bias[tok] = el[1].get(); + if (tok >= 0 && tok < n_vocab) { + if (el[1].is_number_float()) { + llama.params.logit_bias[tok] = el[1].get(); + } else if (el[1].is_boolean() && !el[1].get()) { + llama.params.logit_bias[tok] = -INFINITY; + } + } } } } From 8f9e546b515341d9ce71aabc01fafb2c78c3c6d9 Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 08:14:28 -0300 Subject: [PATCH 077/121] trim partial stopping strings when not streaming --- examples/server/server.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fd11798a256b6..df921989886bd 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -761,15 +761,21 @@ int main(int argc, char **argv) llama.beginCompletion(); if (!llama.stream) { + size_t stop_pos = std::string::npos; + while (llama.has_next_token) { const std::string token_text = llama.doCompletion(); - const size_t stop_pos = llama.findStoppingStrings( - llama.generated_text, token_text.size(), STOP_FULL); - if (stop_pos != std::string::npos) { - llama.generated_text.erase(llama.generated_text.begin() + stop_pos, - llama.generated_text.end()); - } + stop_pos = llama.findStoppingStrings(llama.generated_text, + token_text.size(), STOP_FULL); + } + + if (stop_pos == std::string::npos) { + stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); + } + if (stop_pos != std::string::npos) { + llama.generated_text.erase(llama.generated_text.begin() + stop_pos, + llama.generated_text.end()); } json data = {{"content", llama.generated_text}, From f820740dadd92230b202c87683cfc420e713ae7a Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 08:16:39 -0300 Subject: [PATCH 078/121] move multibyte check to doCompletion --- examples/server/server.cpp | 56 ++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index df921989886bd..cd463b405a000 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -67,6 +67,7 @@ struct llama_server_context bool verbose = false; int json_indent = -1; + int32_t multibyte_pending = 0; ~llama_server_context() { @@ -82,6 +83,7 @@ struct llama_server_context generated_text = ""; generated_text.reserve(params.n_ctx); stopping_word = ""; + multibyte_pending = 0; n_remain = 0; n_past = 0; @@ -300,13 +302,33 @@ struct llama_server_context std::string doCompletion() { llama_token token = nextToken(); - if (token == -1) { - return ""; - } - std::string token_text = llama_token_to_str(ctx, token); + std::string 
token_text = token == -1 ? "" : llama_token_to_str(ctx, token); generated_text += token_text; + if (multibyte_pending > 0) { + multibyte_pending -= token_text.size(); + } else if (token_text.size() == 1) { + const char c = token_text[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) { + multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF0) == 0xE0) { + multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF8) == 0xF0) { + multibyte_pending = 3; + } else { + multibyte_pending = 0; + } + } + + if (multibyte_pending > 0 && !has_next_token) { + has_next_token = true; + n_remain++; + } + if (verbose) { fprintf(stderr, "next token: {\n" @@ -794,34 +816,10 @@ int main(int argc, char **argv) } else { const auto chunked_content_provider = [&](size_t, DataSink &sink) { size_t sent_count = 0; - int32_t multibyte_pending = 0; while (llama.has_next_token) { const std::string token_text = llama.doCompletion(); - - if (multibyte_pending > 0) { - multibyte_pending -= token_text.size(); - } else if (token_text.size() == 1) { - const char c = token_text[0]; - // 2-byte characters: 110xxxxx 10xxxxxx - if ((c & 0xE0) == 0xC0) { - multibyte_pending = 1; - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF0) == 0xE0) { - multibyte_pending = 2; - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF8) == 0xF0) { - multibyte_pending = 3; - } else { - multibyte_pending = 0; - } - } - - if (multibyte_pending > 0) { - if (!llama.has_next_token) { - llama.has_next_token = true; - llama.n_remain++; - } + if (llama.multibyte_pending > 0) { continue; } From 3df0192804f6dbb0bcf80511bbebd62fbe0440c5 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 2 Jun 2023 15:18:51 +0300 Subject: [PATCH 079/121] improve long input truncation and add more verbose logging --- examples/server/server.cpp | 71 ++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index cd463b405a000..038fd16c8fe8b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -47,6 +47,27 @@ size_t find_partial_stop_string(const std::string &stop, const std::string &text return std::string::npos; } +static std::string debug_str(const std::string & s) { + std::string ret; + for (size_t i = 0; s[i]; i++) { + switch (s[i]) { + case '\n': ret += "\\n"; break; + case '"': ret += "\\\""; break; + default: ret += s[i]; break; + } + } + return ret; +} + +template +static std::string tokens_to_str(llama_context * ctx, InputIt begin, OutputIt end) { + std::string ret; + for (; begin != end; (void)++begin) { + ret += llama_token_to_str(ctx, *begin); + } + return ret; +} + struct llama_server_context { bool stream = false; @@ -117,8 +138,22 @@ struct llama_server_context if (prompt_tokens.size() >= (size_t)params.n_ctx) { const int n_left = (params.n_ctx - params.n_keep)/2; std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - new_tokens.insert(new_tokens.end(), prompt_tokens.end() - n_left, prompt_tokens.end()); + const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left; + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); + + if (verbose) { + fprintf(stderr, + 
"input truncated: {\n" + " n_ctx: %d,\n" + " n_keep: %d,\n" + " n_left: %d,\n" + " new_tokens: \"%s\",\n" + "}\n", + params.n_ctx, params.n_keep, n_left, + debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); + } + prompt_tokens = new_tokens; } else { size_t ps = prompt_tokens.size(); @@ -133,6 +168,19 @@ struct llama_server_context // we have to evaluate at least 1 token to generate logits. n_past--; } + + if (verbose) { + fprintf(stderr, + "prompt: {\n" + " n_past: %zu,\n" + " cached: \"%s\",\n" + " to_eval: \"%s\",\n" + "}\n", + n_past, + debug_str(tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)).c_str(), + debug_str(tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())).c_str()); + } + has_next_token = true; } @@ -154,6 +202,17 @@ struct llama_server_context new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); embd = new_tokens; n_past = params.n_keep; + if (verbose) { + fprintf(stderr, + "input truncated: {\n" + " n_ctx: %d,\n" + " n_keep: %d,\n" + " n_left: %d,\n" + " new_tokens: \"%s\",\n" + "}\n", + params.n_ctx, params.n_keep, n_left, + debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); + } } while (n_past < embd.size()) @@ -339,8 +398,8 @@ struct llama_server_context " num_tokens_predicted: %ld,\n" " stopping_word: \"%s\",\n" "}\n", - token, token_text.c_str(), has_next_token, n_remain, num_tokens_predicted, - stopping_word.c_str()); + token, debug_str(llama_token_to_str(ctx, token)).c_str(), has_next_token, n_remain, num_tokens_predicted, + debug_str(stopping_word).c_str()); } return token_text; @@ -710,10 +769,10 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & json tmp = format_generation_settings(llama); fprintf(stderr, "-------------------------\n" - "/completion parameters: %s\n" - "PROMPT[%s]\n", + "completion parameters: %s\n" + "full prompt: \"%s\"\n", tmp.dump(4, ' ', false, json::error_handler_t::replace).c_str(), - llama.params.prompt.c_str()); + debug_str(llama.params.prompt).c_str()); } return true; From 3ff27d30e3da54d8e3e44374f47dcc9b67656d41 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Fri, 2 Jun 2023 09:20:53 -0400 Subject: [PATCH 080/121] Fixed up a few things in embedding mode. 
--- examples/server/server.cpp | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 038fd16c8fe8b..334bf88c51a10 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -958,23 +958,34 @@ int main(int argc, char **argv) svr.Post("/embedding", [&llama](const Request &req, Response &res) { + json data; if(!llama.params.embedding) { std::vector empty; - json data = { - {"embedding", empty}}; - fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n"); + data = { + {"embedding", empty}, + {"error", "Server is not in embedding mode."} }; + fprintf(stderr, "[llama-server] : You need to enable embedding mode by adding --embedding when launching the server.\n"); return res.set_content(data.dump(llama.json_indent), "application/json"); } json body = json::parse(req.body); - std::string content = body["content"].get(); - int threads = body["threads"].get(); - json data = { - {"embedding", llama.embedding(content, threads) } }; + if (body["content"].is_null()) { + std::vector empty; + data = { + {"embedding", empty}, + {"error", "The embedding content was not set."} }; + fprintf(stderr, "[llama-server] : The embedding content was not set.\n"); + } + else + { + std::string content = body["content"].get(); + data = { + {"embedding", llama.embedding(content, llama.params.n_threads) } }; + } return res.set_content(data.dump(llama.json_indent), "application/json"); }); if(params.embedding) { - fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n"); + fprintf(stderr, "NOTE: Embedding mode enabled. Completion is disabled in this mode.\n"); } svr.set_logger([](const Request& req, const Response& res) { From 41bb71bde759d2826faf199b6927a241979f6f20 Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 10:37:13 -0300 Subject: [PATCH 081/121] replace invalid characters instead of crashing While logging the requests. --- examples/server/server.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 334bf88c51a10..cd0fb0a23fc52 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -995,7 +995,8 @@ int main(int argc, char **argv) { "request", req.body }, { "response", res.body }, }; - fprintf(stdout, "http_request: %s\n", log.dump().c_str()); + fprintf(stdout, "http_request: %s\n", + log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); }); svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { From 16e1c9813add1fef748d2d93669c9554e1da7567 Mon Sep 17 00:00:00 2001 From: digiwombat Date: Fri, 2 Jun 2023 10:05:52 -0400 Subject: [PATCH 082/121] Removed the embedding api endpoint and associated code. 
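A note for API consumers: once this lands, the `/embedding` route is simply no longer registered. The sketch below shows one way a client could detect that; the assumption that an unregistered route answers with a non-OK status (typically 404 from cpp-httplib) is mine, not something this patch guarantees.

```javascript
// Hypothetical guard for clients that still call the removed /embedding route.
// Assumes the default 127.0.0.1:8080 and that an unregistered route returns a non-OK status.
async function tryEmbedding(content) {
    const res = await fetch("http://127.0.0.1:8080/embedding", {
        method: "POST",
        body: JSON.stringify({ content }),
    });
    if (!res.ok) {
        console.error(`/embedding is not available on this server (HTTP ${res.status})`);
        return null;
    }
    return (await res.json()).embedding;
}
```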
--- examples/server/server.cpp | 65 -------------------------------------- 1 file changed, 65 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 334bf88c51a10..247e54dc9096e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -404,24 +404,6 @@ struct llama_server_context return token_text; } - - std::vector embedding(std::string content, int threads) { - content.insert(0, 1, ' '); - std::vector tokens = ::llama_tokenize(ctx, content, true); - if (!tokens.empty()) - { - if (llama_eval(ctx, tokens.data(), tokens.size(), 0, threads)) - { - fprintf(stderr, "%s : failed to eval\n", __func__); - std::vector embeddings_; - return embeddings_; - } - } - const int n_embd = llama_n_embd(ctx); - auto *const embeddings = llama_get_embeddings(ctx); - std::vector embeddings_(embeddings, embeddings + n_embd); - return embeddings_; - } }; using namespace httplib; @@ -440,7 +422,6 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, con fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); - fprintf(stderr, " --embedding enable embedding mode\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); if (llama_mlock_supported()) { @@ -521,10 +502,6 @@ void server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } params.model_alias = argv[i]; } - else if (arg == "--embedding") - { - params.embedding = true; - } else if (arg == "-h" || arg == "--help") { server_print_usage(argc, argv, default_params, default_sparams); @@ -820,16 +797,6 @@ int main(int argc, char **argv) { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); svr.Post("/completion", [&llama](const Request &req, Response &res) { - if (llama.params.embedding) { - json data = { - {"status", "error"}, - {"reason", "To use completion function, disable embedding mode"}}; - res.set_content( - data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), - "application/json"); - res.status = 400; - return; - } llama.rewind(); llama_reset_timings(llama.ctx); @@ -956,38 +923,6 @@ int main(int argc, char **argv) return res.set_content(data.dump(llama.json_indent), "application/json"); }); - svr.Post("/embedding", [&llama](const Request &req, Response &res) - { - json data; - if(!llama.params.embedding) { - std::vector empty; - data = { - {"embedding", empty}, - {"error", "Server is not in embedding mode."} }; - fprintf(stderr, "[llama-server] : You need to enable embedding mode by adding --embedding when launching the server.\n"); - return res.set_content(data.dump(llama.json_indent), "application/json"); - } - json body = json::parse(req.body); - if (body["content"].is_null()) { - std::vector empty; - data = { - {"embedding", empty}, - {"error", "The embedding content was not set."} }; - fprintf(stderr, "[llama-server] : The embedding content was not set.\n"); - } - else - { - std::string content = body["content"].get(); - data = { - {"embedding", llama.embedding(content, llama.params.n_threads) } }; - } - return res.set_content(data.dump(llama.json_indent), "application/json"); - }); - - if(params.embedding) { - fprintf(stderr, "NOTE: Embedding mode enabled. Completion is disabled in this mode.\n"); - } - svr.set_logger([](const Request& req, const Response& res) { json log = { { "status", res.status }, From bcd616700e561424db77bfabc334f13b811f9968 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Fri, 2 Jun 2023 18:04:46 +0300 Subject: [PATCH 083/121] improve docs and example --- examples/server/README.md | 275 +++++++------------------------------- examples/server/chat.mjs | 61 +++++++++ 2 files changed, 109 insertions(+), 227 deletions(-) create mode 100644 examples/server/chat.mjs diff --git a/examples/server/README.md b/examples/server/README.md index bba513c7eba8b..7151fcd4dd4f4 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -2,32 +2,45 @@ This example allow you to have a llama.cpp http server to interact from a web page or consume the API. -## Table of Contents +Command line options: + +- `--threads N`, `-t N`: use N threads. +- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**. +- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`; +- `--port`: Set the port to listen. Default: `8080`. -1. [Quick Start](#quick-start) -2. [Node JS Test](#node-js-test) -3. [API Endpoints](#api-endpoints) -4. [More examples](#more-examples) -5. [Common Options](#common-options) -6. 
[Performance Tuning and Memory Options](#performance-tuning-and-memory-options) ## Quick Start To get started right away, run the following command, making sure to use the correct path for the model you have: -#### Unix-based systems (Linux, macOS, etc.): +### Unix-based systems (Linux, macOS, etc.): ```bash -./server -m models/7B/ggml-model.bin --ctx_size 2048 +./server -m models/7B/ggml-model.bin -c 2048 ``` -#### Windows: +### Windows: ```powershell -server.exe -m models\7B\ggml-model.bin --ctx_size 2048 +server.exe -m models\7B\ggml-model.bin -c 2048 ``` -That will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library. +That will start a server that by default listens on `127.0.0.1:8080`. +You can consume the endpoints with Postman or NodeJS with axios library. + +## Testing with CURL + +Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS. + +```sh +curl --request POST \ + --url http://localhost:8080/completion \ + --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' +``` ## Node JS Test @@ -50,7 +63,6 @@ const prompt = `Building a website can be done in 10 simple steps:`; async function Test() { let result = await axios.post("http://127.0.0.1:8080/completion", { prompt, - batch_size: 128, n_predict: 512, }); @@ -69,244 +81,53 @@ node . ## API Endpoints -You can interact with this API Endpoints. This implementations just support chat style interaction. +You can interact with this API Endpoints. +This implementations just support chat style interaction. - **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks. -*Options:* - -`batch_size`: Set the batch size for prompt processing (default: 512). - -`temperature`: Adjust the randomness of the generated text (default: 0.8). - -`top_k`: Limit the next token selection to the K most probable tokens (default: 40). + *Options:* -`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). + `temperature`: Adjust the randomness of the generated text (default: 0.8). -`n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity). + `top_k`: Limit the next token selection to the K most probable tokens (default: 40). -`threads`: Set the number of threads to use during computation. + `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). -`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. + `n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity). -`as_loop`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. + `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. + By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. -`interactive`: It allows interacting with the completion, and the completion stops as soon as it encounters a `stop word`. To enable this, set to `true`. 
+ `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. -`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. + `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. -`stop`: Specify the words or characters that indicate a stop. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. - -`exclude`: Specify the words or characters you do not want to appear in the completion. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. + `stop`: Specify the strings that indicate a stop. + These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. + Default: `[]` - **POST** `hostname:port/embedding`: Generate embedding of a given text -*Options:* - -`content`: Set the text to get generate the embedding. + *Options:* -`threads`: Set the number of threads to use during computation. + `content`: Set the text to get generate the embedding. -To use this endpoint, you need to start the server with the `--embedding` option added. + To use this endpoint, you need to start the server with the `--embedding` option added. - **POST** `hostname:port/tokenize`: Tokenize a given text -*Options:* - -`content`: Set the text to tokenize. + *Options:* -- **GET** `hostname:port/next-token`: Receive the next token predicted, execute this request in a loop. Make sure set `as_loop` as `true` in the completion request. - -*Options:* - -`stop`: Set `hostname:port/next-token?stop=true` to stop the token generation. + `content`: Set the text to tokenize. ## More examples ### Interactive mode -This mode allows interacting in a chat-like manner. It is recommended for models designed as assistants such as `Vicuna`, `WizardLM`, `Koala`, among others. Make sure to add the correct stop word for the corresponding model. - -The prompt should be generated by you, according to the model's guidelines. You should keep adding the model's completions to the context as well. - -This example works well for `Vicuna - version 1`. - -```javascript -const axios = require("axios"); - -let prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. -### Human: Hello, Assistant. -### Assistant: Hello. How may I help you today? -### Human: Please tell me the largest city in Europe. -### Assistant: Sure. 
The largest city in Europe is Moscow, the capital of Russia.`; - -async function ChatCompletion(answer) { - // the user's next question to the prompt - prompt += `\n### Human: ${answer}\n` - - result = await axios.post("http://127.0.0.1:8080/completion", { - prompt, - batch_size: 128, - temperature: 0.2, - top_k: 40, - top_p: 0.9, - n_keep: -1, - n_predict: 2048, - stop: ["\n### Human:"], // when detect this, stop completion - exclude: ["### Assistant:"], // no show in the completion - threads: 8, - as_loop: true, // use this to request the completion token by token - interactive: true, // enable the detection of a stop word - }); - - // create a loop to receive every token predicted - // note: this operation is blocking, avoid use this in a ui thread - - let message = ""; - while (true) { - // you can stop the inference adding '?stop=true' like this http://127.0.0.1:8080/next-token?stop=true - result = await axios.get("http://127.0.0.1:8080/next-token"); - process.stdout.write(result.data.content); - message += result.data.content; - - // to avoid an infinite loop - if (result.data.stop) { - console.log("Completed"); - // make sure to add the completion to the prompt. - prompt += `### Assistant: ${message}`; - break; - } - } -} - -// This function should be called every time a question to the model is needed. -async function Test() { - // the server can't inference in paralell - await ChatCompletion("Write a long story about a time magician in a fantasy world"); - await ChatCompletion("Summary the story"); -} - -Test(); -``` - -### Alpaca example - -**Temporaly note:** no tested, if you have the model, please test it and report me some issue - -```javascript -const axios = require("axios"); - -let prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request. -`; - -async function DoInstruction(instruction) { - prompt += `\n\n### Instruction:\n\n${instruction}\n\n### Response:\n\n`; - result = await axios.post("http://127.0.0.1:8080/completion", { - prompt, - batch_size: 128, - temperature: 0.2, - top_k: 40, - top_p: 0.9, - n_keep: -1, - n_predict: 2048, - stop: ["### Instruction:\n\n"], // when detect this, stop completion - exclude: [], // no show in the completion - threads: 8, - as_loop: true, // use this to request the completion token by token - interactive: true, // enable the detection of a stop word - }); - - // create a loop to receive every token predicted - // note: this operation is blocking, avoid use this in a ui thread - - let message = ""; - while (true) { - result = await axios.get("http://127.0.0.1:8080/next-token"); - process.stdout.write(result.data.content); - message += result.data.content; - - // to avoid an infinite loop - if (result.data.stop) { - console.log("Completed"); - // make sure to add the completion and the user's next question to the prompt. - prompt += message; - break; - } - } -} - -// This function should be called every time a instruction to the model is needed. -DoInstruction("Destroy the world"); // as joke -``` - -### Embeddings - -First, run the server with `--embedding` option: - -```bash -server -m models/7B/ggml-model.bin --ctx_size 2048 --embedding -``` - -Run this code in NodeJS: - -```javascript -const axios = require('axios'); - -async function Test() { - let result = await axios.post("http://127.0.0.1:8080/embedding", { - content: `Hello`, - threads: 5 - }); - // print the embedding array - console.log(result.data.embedding); -} +Check the sample in [chat.mjs](chat.mjs). 
+Run with node: -Test(); +```sh +node chat.mjs ``` - -### Tokenize - -Run this code in NodeJS: - -```javascript -const axios = require('axios'); - -async function Test() { - let result = await axios.post("http://127.0.0.1:8080/tokenize", { - content: `Hello` - }); - // print the embedding array - console.log(result.data.tokens); -} - -Test(); -``` - -## Common Options - -- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). -- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. -- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**. -- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`; -- `--port`: Set the port to listen. Default: `8080`. - -### RNG Seed - -- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed). - -The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run. - -## Performance Tuning and Memory Options - -### No Memory Mapping - -- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. - -### Memory Float 32 - -- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement but does not appear to increase generation quality in a measurable way. Not recommended. - -## Limitations: - -- The actual implementation of llama.cpp need a `llama-state` for handle multiple contexts and clients, but this could require more powerful hardware. diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs new file mode 100644 index 0000000000000..5f1431f9d5145 --- /dev/null +++ b/examples/server/chat.mjs @@ -0,0 +1,61 @@ +import * as readline from 'node:readline/promises'; +import { stdin as input, stdout as output } from 'node:process'; + +const chat = [ + { human: "Hello, Assistant.", + assistant: "Hello. How may I help you today?" }, + { human: "Please tell me the largest city in Europe.", + assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia." }, +] + +function format_prompt(question) { + return "A chat between a curious human and an artificial intelligence assistant. 
" + + "The assistant gives helpful, detailed, and polite answers to the human's questions.\n" + + chat.map(m => `### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n") + + `\n### Human: ${question}\n### Assistant:` +} + +async function ChatCompletion(question) { + const result = await fetch("http://127.0.0.1:8080/completion", { + method: 'POST', + body: JSON.stringify({ + prompt: format_prompt(question), + temperature: 0.2, + top_k: 40, + top_p: 0.9, + n_keep: 29, + n_predict: 256, + stop: ["\n### Human:"], // when detect this, stop completion + stream: true, + }) + }) + + if (!result.ok) { + return; + } + + let answer = '' + + for await (var chunk of result.body) { + const t = Buffer.from(chunk).toString('utf8') + if (t.startsWith('data: ')) { + const message = JSON.parse(t.substring(6)) + answer += message.content + process.stdout.write(message.content) + if (message.stop) break; + } + } + + process.stdout.write('\n') + chat.push({ human: question, assistant: answer }) +} + +const rl = readline.createInterface({ input, output }); + +while(true) { + + const question = await rl.question('> ') + await ChatCompletion(question); + +} + From de6df486e95480b73f28d26f952c47919212df1a Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Fri, 2 Jun 2023 08:24:46 -0700 Subject: [PATCH 084/121] Removed embedding from README --- examples/server/README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 7151fcd4dd4f4..d5ca24cf86ac3 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -107,14 +107,6 @@ This implementations just support chat style interaction. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]` -- **POST** `hostname:port/embedding`: Generate embedding of a given text - - *Options:* - - `content`: Set the text to get generate the embedding. - - To use this endpoint, you need to start the server with the `--embedding` option added. - - **POST** `hostname:port/tokenize`: Tokenize a given text *Options:* From 5758e9f09bdb1db460c3728af4f441aa328ade74 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Fri, 2 Jun 2023 08:31:12 -0700 Subject: [PATCH 085/121] Removed embedding from flags. --- examples/server/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index d5ca24cf86ac3..334d53fa9e20e 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -8,11 +8,9 @@ Command line options: - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**. - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`; - `--port`: Set the port to listen. Default: `8080`. 
- ## Quick Start To get started right away, run the following command, making sure to use the correct path for the model you have: From e1e2be21469d4e4aaeb7ffa2a4ba730a5d5cd2f5 Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 17:47:42 -0300 Subject: [PATCH 086/121] remove --keep from help text --- examples/server/server.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5e66a7b31cf2a..ecae8ecc3c12a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -422,7 +422,6 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, con fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); - fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); if (llama_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); From a6ed390cc6c39f5396bb9fd16b2acc605dfa02a8 Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 17:48:29 -0300 Subject: [PATCH 087/121] update readme --- examples/server/README.md | 50 +++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 334d53fa9e20e..0f3fe22e8d784 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -1,14 +1,22 @@ # llama.cpp/example/server -This example allow you to have a llama.cpp http server to interact from a web page or consume the API. +This example demonstrates a simple HTTP API server to interact with llama.cpp. Command line options: -- `--threads N`, `-t N`: use N threads. +- `--threads N`, `-t N`: Set the number of threads to use during computation. - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`; +- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. +- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. +- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. +- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. 
This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. +- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. +- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. - `--port`: Set the port to listen. Default: `8080`. ## Quick Start @@ -79,10 +87,7 @@ node . ## API Endpoints -You can interact with this API Endpoints. -This implementations just support chat style interaction. - -- **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks. +- **POST** `/completion`: Given a prompt, it returns the predicted completion. *Options:* @@ -102,10 +107,35 @@ This implementations just support chat style interaction. `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. `stop`: Specify the strings that indicate a stop. - These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. - Default: `[]` + These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). + + `tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). + + `typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled). + + `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1). + + `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). + + `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true). + + `presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled). + + `frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled); + + `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0). + + `mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0). + + `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1). + + `seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed). + + `ignore_eos`: Ignore end of stream token and continue generating (default: false). + + `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `logit-bias: [[15043,1]]` to increase the likelihood of the token 'Hello', or `logit-bias: [[15043,-1]]` to decrease its likelihood. Setting the value to false, `logit-bias: [[15043,false]]` ensures that the token `Hello` is never produced (default: []). -- **POST** `hostname:port/tokenize`: Tokenize a given text +- **POST** `/tokenize`: Tokenize a given text. 
*Options:* From 05a5a485b8e41737058df1815b33e2043ad1677c Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 17:52:04 -0300 Subject: [PATCH 088/121] make help text load faster --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ecae8ecc3c12a..9b653a2f69ba3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -756,8 +756,6 @@ bool parse_options_completion(json body, llama_server_context& llama, Response & int main(int argc, char **argv) { - llama_init_backend(); - // own arguments required by this example gpt_params params; server_params sparams; @@ -775,6 +773,8 @@ int main(int argc, char **argv) params.model_alias = params.model; } + llama_init_backend(); + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); From 98ae2de0170955dbaaa4555b28ef8a9954a74eba Mon Sep 17 00:00:00 2001 From: anon Date: Fri, 2 Jun 2023 17:54:46 -0300 Subject: [PATCH 089/121] parse --mlock and --no-mmap + format --- examples/server/server.cpp | 145 +++++++++++++++---------------------- 1 file changed, 57 insertions(+), 88 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9b653a2f69ba3..117a67826949e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -446,100 +446,73 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, con fprintf(stderr, "\n"); } -void server_params_parse(int argc, char **argv, server_params &sparams, gpt_params ¶ms) +void server_params_parse(int argc, char **argv, server_params &sparams, + gpt_params ¶ms) { gpt_params default_params; server_params default_sparams; std::string arg; bool invalid_param = false; - for (int i = 1; i < argc; i++) - { + for (int i = 1; i < argc; i++) { arg = argv[i]; - if (arg == "--port") - { - if (++i >= argc) - { + if (arg == "--port") { + if (++i >= argc) { invalid_param = true; break; } sparams.port = std::stoi(argv[i]); - } - else if (arg == "--host") - { - if (++i >= argc) - { + } else if (arg == "--host") { + if (++i >= argc) { invalid_param = true; break; } sparams.hostname = argv[i]; - } - else if (arg == "--timeout" || arg == "-to") - { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.read_timeout = std::stoi(argv[i]); - sparams.write_timeout = std::stoi(argv[i]); - } - else if (arg == "-m" || arg == "--model") - { - if (++i >= argc) - { + } else if (arg == "--timeout" || arg == "-to") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.read_timeout = std::stoi(argv[i]); + sparams.write_timeout = std::stoi(argv[i]); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { invalid_param = true; break; } params.model = argv[i]; - } - else if (arg == "-a" || arg == "--alias") - { - if (++i >= argc) - { + } else if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { invalid_param = true; break; } params.model_alias = argv[i]; - } - else if (arg == "-h" || arg == "--help") - { + } else if (arg == "-h" || arg == "--help") { server_print_usage(argc, argv, default_params, default_sparams); exit(0); - } - else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") - { - if (++i >= argc) - { + } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { + if (++i >= argc) { invalid_param = true; break; } 
params.n_ctx = std::stoi(argv[i]); - } - else if (arg == "--memory-f32" || arg == "--memory_f32") - { + } else if (arg == "--memory-f32" || arg == "--memory_f32") { params.memory_f16 = false; - } - else if (arg == "--threads" || arg == "-t") - { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads = std::stoi(argv[i]); - } - else if (arg == "-b" || arg == "--batch-size") - { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_batch = std::stoi(argv[i]); - params.n_batch = std::min(512, params.n_batch); - } - else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") - { - if (++i >= argc) - { + } else if (arg == "--threads" || arg == "-t") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { invalid_param = true; break; } @@ -549,37 +522,33 @@ void server_params_parse(int argc, char **argv, server_params &sparams, gpt_para fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif - } - else if (arg == "--lora") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.lora_adapter = argv[i]; - params.use_mmap = false; - } - else if (arg == "--lora-base") - { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; + } else if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter = argv[i]; + params.use_mmap = false; + } else if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; } else if (arg == "-v" || arg == "--verbose") { - sparams.verbose = true; - } - else - { + sparams.verbose = true; + } else if (arg == "--mlock") { + params.use_mlock = true; + } else if (arg == "--no-mmap") { + params.use_mmap = false; + } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argc, argv, default_params, default_sparams); exit(1); } } - if (invalid_param) - { + if (invalid_param) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); server_print_usage(argc, argv, default_params, default_sparams); exit(1); From 61befcba7ba87141e53e906e9b337e61daac8fe6 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 8 Jun 2023 22:14:43 +0300 Subject: [PATCH 090/121] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fd35218ce0c1a..94953b35e3360 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -156,7 +156,7 @@ struct llama_server_context prompt_tokens = new_tokens; } else { - size_t ps = prompt_tokens.size(); + const size_t ps = prompt_tokens.size(); std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); } @@ -744,7 +744,7 @@ bool parse_options_completion(json body, 
llama_server_context& llama, Response & if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); } else { - json data = {{"status", "error"}, {"reason", "You need to pass the prompt"}}; + json data = {{"status", "error"}, {"reason", "You need to provide a prompt"}}; res.set_content(data.dump(llama.json_indent), "application/json"); res.status = 400; return false; From ccd85e0a6bb2df327236ff749b348e78b7955536 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Thu, 8 Jun 2023 22:17:46 +0300 Subject: [PATCH 091/121] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- examples/server/README.md | 2 +- examples/server/chat.mjs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index fe4b50e5a29c6..d98c7a20375b5 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -37,7 +37,7 @@ To get started right away, run the following command, making sure to use the cor server.exe -m models\7B\ggml-model.bin -c 2048 ``` -That will start a server that by default listens on `127.0.0.1:8080`. +The above command will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library. ## Testing with CURL diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs index 5f1431f9d5145..349937e940a08 100644 --- a/examples/server/chat.mjs +++ b/examples/server/chat.mjs @@ -25,7 +25,7 @@ async function ChatCompletion(question) { top_p: 0.9, n_keep: 29, n_predict: 256, - stop: ["\n### Human:"], // when detect this, stop completion + stop: ["\n### Human:"], // stop completion after generating this stream: true, }) }) From a9c34779f6cc4de3836087a4935cec7eb22db49e Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Fri, 9 Jun 2023 04:47:18 -0400 Subject: [PATCH 092/121] Spaces to 4 and other code style cleanup. Notes in README. --- examples/server/README.md | 4 +- examples/server/server.cpp | 1724 ++++++++++++++++++------------------ 2 files changed, 864 insertions(+), 864 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index d98c7a20375b5..364291ee3aac4 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -23,6 +23,8 @@ Command line options: ## Quick Start +**Note:** The server is not built by default. Make sure to add `LLAMA_BUILD_SERVER=ON` to your CMake command. + To get started right away, run the following command, making sure to use the correct path for the model you have: ### Unix-based systems (Linux, macOS, etc.): @@ -99,7 +101,7 @@ node . `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). - `n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity). + `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the the limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity). `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. 
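To make the documented options concrete, a rough sketch of a non-streaming request follows. The endpoint, the field names, and the `content` field in the response come from this README and the completion handler; the prompt text and the surrounding Node code are illustrative only.

```javascript
// Sketch: non-streaming completion request using the options documented above.
// Assumes a server on the default 127.0.0.1:8080; with stream left unset the full
// result arrives as a single JSON object with a "content" field.
const response = await fetch("http://127.0.0.1:8080/completion", {
    method: "POST",
    body: JSON.stringify({
        prompt: "Building a website can be done in 10 simple steps:",
        n_predict: 64,  // upper bound; may run slightly past this if the last token is a partial multibyte character
        n_keep: -1,     // keep the whole initial prompt when the context is reset
        stop: ["\n\n"], // stop strings are trimmed from the returned content
    }),
});

const { content } = await response.json();
console.log(content);
```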
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 94953b35e3360..2cd880685b61b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -7,17 +7,17 @@ struct server_params { - std::string hostname = "127.0.0.1"; - int32_t port = 8080; - int32_t read_timeout = 600; - int32_t write_timeout = 600; - bool verbose = false; + std::string hostname = "127.0.0.1"; + int32_t port = 8080; + int32_t read_timeout = 600; + int32_t write_timeout = 600; + bool verbose = false; }; -static size_t common_part(const std::vector & a, const std::vector & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++); - return i; +static size_t common_part(const std::vector& a, const std::vector& b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++); + return i; } enum stop_type { @@ -25,13 +25,13 @@ enum stop_type { STOP_PARTIAL, }; -bool ends_with(const std::string &str, const std::string &suffix) +bool ends_with(const std::string& str, const std::string& suffix) { return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } -size_t find_partial_stop_string(const std::string &stop, const std::string &text) +size_t find_partial_stop_string(const std::string& stop, const std::string& text) { if (!text.empty()) { const char text_last_char = text.back(); @@ -47,938 +47,936 @@ size_t find_partial_stop_string(const std::string &stop, const std::string &text return std::string::npos; } -static std::string debug_str(const std::string & s) { - std::string ret; - for (size_t i = 0; s[i]; i++) { - switch (s[i]) { - case '\n': ret += "\\n"; break; - case '"': ret += "\\\""; break; - default: ret += s[i]; break; +static std::string debug_str(const std::string& s) { + std::string ret; + for (size_t i = 0; s[i]; i++) { + switch (s[i]) { + case '\n': ret += "\\n"; break; + case '"': ret += "\\\""; break; + default: ret += s[i]; break; + } } - } - return ret; + return ret; } template -static std::string tokens_to_str(llama_context * ctx, InputIt begin, OutputIt end) { - std::string ret; - for (; begin != end; (void)++begin) { - ret += llama_token_to_str(ctx, *begin); - } - return ret; +static std::string tokens_to_str(llama_context* ctx, InputIt begin, OutputIt end) { + std::string ret; + for (; begin != end; (void)++begin) { + ret += llama_token_to_str(ctx, *begin); + } + return ret; } struct llama_server_context { - bool stream = false; - bool has_next_token = false; - std::string generated_text = ""; - - size_t num_tokens_predicted = 0; - size_t n_past = 0; - size_t n_remain = 0; - - std::vector embd; - std::vector last_n_tokens; - - llama_context *ctx = nullptr; - gpt_params params; - - std::string stopping_word; - - bool verbose = false; - int json_indent = -1; - int32_t multibyte_pending = 0; - - ~llama_server_context() - { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - } - - void rewind() { - params.antiprompt.clear(); - num_tokens_predicted = 0; - generated_text = ""; - generated_text.reserve(params.n_ctx); - stopping_word = ""; - multibyte_pending = 0; - - n_remain = 0; - n_past = 0; - } - - bool loadModel(const gpt_params ¶ms_) - { - params = params_; - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) + bool stream = false; + bool has_next_token = false; + std::string generated_text = ""; + + size_t num_tokens_predicted = 0; + size_t n_past = 0; + size_t n_remain = 0; + + std::vector embd; 
+ std::vector last_n_tokens; + + llama_context* ctx = nullptr; + gpt_params params; + + std::string stopping_word; + + bool verbose = false; + int json_indent = -1; + int32_t multibyte_pending = 0; + + ~llama_server_context() { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return false; + if (ctx) { + llama_free(ctx); + ctx = nullptr; + } } - last_n_tokens.resize(params.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - return true; - } + void rewind() { + params.antiprompt.clear(); + num_tokens_predicted = 0; + generated_text = ""; + generated_text.reserve(params.n_ctx); + stopping_word = ""; + multibyte_pending = 0; - void loadPrompt() { - params.prompt.insert(0, 1, ' '); // always add a first space - std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); + n_remain = 0; + n_past = 0; + } - if (params.n_keep < 0) { - params.n_keep = (int)prompt_tokens.size(); + bool loadModel(const gpt_params& params_) + { + params = params_; + ctx = llama_init_from_gpt_params(params); + if (ctx == NULL) + { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return false; + } + + last_n_tokens.resize(params.n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + return true; } - params.n_keep = std::min(params.n_ctx - 4, params.n_keep); - // if input prompt is too big, truncate like normal - if (prompt_tokens.size() >= (size_t)params.n_ctx) { - const int n_left = (params.n_ctx - params.n_keep)/2; - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); + void loadPrompt() { + params.prompt.insert(0, 1, ' '); // always add a first space + std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); - if (verbose) { - fprintf(stderr, - "input truncated: {\n" - " n_ctx: %d,\n" - " n_keep: %d,\n" - " n_left: %d,\n" - " new_tokens: \"%s\",\n" - "}\n", - params.n_ctx, params.n_keep, n_left, - debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); - } + if (params.n_keep < 0) { + params.n_keep = (int)prompt_tokens.size(); + } + params.n_keep = std::min(params.n_ctx - 4, params.n_keep); + + // if input prompt is too big, truncate like normal + if (prompt_tokens.size() >= (size_t)params.n_ctx) { + const int n_left = (params.n_ctx - params.n_keep) / 2; + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); + const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left; + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); + std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); + + if (verbose) { + fprintf(stderr, + "input truncated: {\n" + " n_ctx: %d,\n" + " n_keep: %d,\n" + " n_left: %d,\n" + " new_tokens: \"%s\",\n" + "}\n", + params.n_ctx, params.n_keep, n_left, + debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); + } - prompt_tokens = new_tokens; - } else { - const size_t ps = prompt_tokens.size(); - std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); - } - - // compare 
the evaluated prompt with the new prompt - n_past = common_part(embd, prompt_tokens); - embd = prompt_tokens; - if (n_past == prompt_tokens.size()) { - // we have to evaluate at least 1 token to generate logits. - n_past--; - } - - if (verbose) { - fprintf(stderr, - "prompt: {\n" - " n_past: %zu,\n" - " cached: \"%s\",\n" - " to_eval: \"%s\",\n" - "}\n", - n_past, - debug_str(tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)).c_str(), - debug_str(tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())).c_str()); - } - - has_next_token = true; - } - - void beginCompletion() - { - // number of tokens to keep when resetting context - n_remain = params.n_predict; - llama_set_rng_seed(ctx, params.seed); - } - - llama_token nextToken() { - llama_token result = -1; - - if (embd.size() >= (size_t)params.n_ctx) { - // Reset context - const int n_left = (params.n_ctx - params.n_keep)/2; - - std::vector new_tokens(embd.begin(), embd.begin() + params.n_keep); - new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); - embd = new_tokens; - n_past = params.n_keep; - if (verbose) { - fprintf(stderr, - "input truncated: {\n" - " n_ctx: %d,\n" - " n_keep: %d,\n" - " n_left: %d,\n" - " new_tokens: \"%s\",\n" + prompt_tokens = new_tokens; + } else { + const size_t ps = prompt_tokens.size(); + std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); + std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); + } + + // compare the evaluated prompt with the new prompt + n_past = common_part(embd, prompt_tokens); + embd = prompt_tokens; + if (n_past == prompt_tokens.size()) { + // we have to evaluate at least 1 token to generate logits. + n_past--; + } + + if (verbose) { + fprintf(stderr, + "prompt: {\n" + " n_past: %zu,\n" + " cached: \"%s\",\n" + " to_eval: \"%s\",\n" "}\n", - params.n_ctx, params.n_keep, n_left, - debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); - } + n_past, + debug_str(tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)).c_str(), + debug_str(tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())).c_str()); + } + + has_next_token = true; } - while (n_past < embd.size()) + void beginCompletion() { - int n_eval = (int)embd.size() - n_past; - if (n_eval > params.n_batch) - { - n_eval = params.n_batch; - } - if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) - { - fprintf(stderr, "%s : failed to eval\n", __func__); - has_next_token = false; - return result; - } - n_past += n_eval; - } - - // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
params.n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - llama_token id = 0; - { - auto *logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); - - // Apply params.logit_bias map - for (const auto &it : params.logit_bias) { - logits[it.first] += it.second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) - { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; - - // Apply penalties - float nl_logit = logits[llama_token_nl()]; - auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); - llama_sample_repetition_penalty(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) - { - logits[llama_token_nl()] = nl_logit; - } - - if (temp <= 0) - { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } - else - { - if (mirostat == 1) - { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + // number of tokens to keep when resetting context + n_remain = params.n_predict; + llama_set_rng_seed(ctx, params.seed); + } + + llama_token nextToken() { + llama_token result = -1; + + if (embd.size() >= (size_t)params.n_ctx) { + // Reset context + const int n_left = (params.n_ctx - params.n_keep) / 2; + + std::vector new_tokens(embd.begin(), embd.begin() + params.n_keep); + new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); + embd = new_tokens; + n_past = params.n_keep; + if (verbose) { + fprintf(stderr, + "input truncated: {\n" + " n_ctx: %d,\n" + " n_keep: %d,\n" + " n_left: %d,\n" + " new_tokens: \"%s\",\n" + "}\n", + params.n_ctx, params.n_keep, n_left, + debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); + } } - else if (mirostat == 2) + + while (n_past < embd.size()) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + int n_eval = (int)embd.size() - n_past; + if (n_eval > params.n_batch) + { + n_eval = params.n_batch; + } + if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) + { + fprintf(stderr, "%s : failed to eval\n", __func__); + has_next_token = false; + return result; + } + n_past += n_eval; } - else + + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
params.n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + llama_token id = 0; { - // Temperature sampling - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - num_tokens_predicted++; - } + auto* logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); - // add it to the context - embd.push_back(id); - result = id; - // decrement remaining sampling budget - --n_remain; + // Apply params.logit_bias map + for (const auto& it : params.logit_bias) { + logits[it.first] += it.second; + } - if (!embd.empty() && embd.back() == llama_token_eos()) { - stopping_word = llama_token_to_str(ctx, embd.back()); - has_next_token = false; - if (verbose) { - fprintf(stderr, "eos token found!\n"); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) + { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Apply penalties + float nl_logit = logits[llama_token_nl()]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); + llama_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) + { + logits[llama_token_nl()] = nl_logit; + } + + if (temp <= 0) + { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) + { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + num_tokens_predicted++; } + + // add it to the context + embd.push_back(id); + result = id; + // decrement remaining sampling budget + --n_remain; + + if (!embd.empty() && embd.back() == llama_token_eos()) { + stopping_word = 
llama_token_to_str(ctx, embd.back()); + has_next_token = false; + if (verbose) { + fprintf(stderr, "eos token found!\n"); + } + return result; + } + + has_next_token = params.n_predict == -1 || n_remain != 0; return result; } - has_next_token = params.n_predict == -1 || n_remain != 0; - return result; - } - - size_t findStoppingStrings(const std::string &text, const size_t last_token_size, - const stop_type type) - { - size_t stop_pos = std::string::npos; - for (const std::string &word : params.antiprompt) { - size_t pos; - if (type == STOP_FULL) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; - pos = text.find(word, from_pos); - } else { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) { + size_t findStoppingStrings(const std::string& text, const size_t last_token_size, + const stop_type type) + { + size_t stop_pos = std::string::npos; + for (const std::string& word : params.antiprompt) { + size_t pos; if (type == STOP_FULL) { - stopping_word = word; - has_next_token = false; + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); + } + else { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) { + if (type == STOP_FULL) { + stopping_word = word; + has_next_token = false; + } + stop_pos = pos; } - stop_pos = pos; } - } - return stop_pos; - } - - std::string doCompletion() - { - llama_token token = nextToken(); - - std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token); - generated_text += token_text; - - if (multibyte_pending > 0) { - multibyte_pending -= token_text.size(); - } else if (token_text.size() == 1) { - const char c = token_text[0]; - // 2-byte characters: 110xxxxx 10xxxxxx - if ((c & 0xE0) == 0xC0) { - multibyte_pending = 1; - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF0) == 0xE0) { - multibyte_pending = 2; - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF8) == 0xF0) { - multibyte_pending = 3; - } else { - multibyte_pending = 0; - } + return stop_pos; } - if (multibyte_pending > 0 && !has_next_token) { - has_next_token = true; - n_remain++; - } + std::string doCompletion() + { + llama_token token = nextToken(); + + std::string token_text = token == -1 ? 
"" : llama_token_to_str(ctx, token); + generated_text += token_text; + + if (multibyte_pending > 0) { + multibyte_pending -= token_text.size(); + } else if (token_text.size() == 1) { + const char c = token_text[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) { + multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF0) == 0xE0) { + multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF8) == 0xF0) { + multibyte_pending = 3; + } else { + multibyte_pending = 0; + } + } - if (verbose) { - fprintf(stderr, - "next token: {\n" - " token: %d,\n" - " token_text: \"%s\",\n" - " has_next_token: %d,\n" - " n_remain: %ld,\n" - " num_tokens_predicted: %ld,\n" - " stopping_word: \"%s\",\n" - "}\n", - token, debug_str(llama_token_to_str(ctx, token)).c_str(), has_next_token, n_remain, num_tokens_predicted, - debug_str(stopping_word).c_str()); - } + if (multibyte_pending > 0 && !has_next_token) { + has_next_token = true; + n_remain++; + } + + if (verbose) { + fprintf(stderr, + "next token: {\n" + " token: %d,\n" + " token_text: \"%s\",\n" + " has_next_token: %d,\n" + " n_remain: %ld,\n" + " num_tokens_predicted: %ld,\n" + " stopping_word: \"%s\",\n" + "}\n", + token, debug_str(llama_token_to_str(ctx, token)).c_str(), has_next_token, n_remain, num_tokens_predicted, + debug_str(stopping_word).c_str()); + } - return token_text; - } + return token_text; + } }; using namespace httplib; using json = nlohmann::json; -void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, const server_params &sparams) +void server_print_usage(int /*argc*/, char** argv, const gpt_params& params, const server_params& sparams) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -v, --verbose verbose output (default: false)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); - if (llama_mlock_supported()) - { - fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); - } - if (llama_mmap_supported()) - { - fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); - } + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -v, --verbose verbose output (default: false)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + fprintf(stderr, " not recommended: doubles 
context memory required and no measurable increase in quality\n"); + if (llama_mlock_supported()) + { + fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + if (llama_mmap_supported()) + { + fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); - fprintf(stderr, " number of layers to store in VRAM\n"); - fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); + fprintf(stderr, " number of layers to store in VRAM\n"); + fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); + fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); #endif - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); - fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); - fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); - fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); - fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); - fprintf(stderr, "\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); + fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); + fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); + fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + fprintf(stderr, "\n"); } -void server_params_parse(int argc, char **argv, server_params &sparams, - gpt_params ¶ms) +void server_params_parse(int argc, char** argv, server_params& sparams, + gpt_params& params) { - gpt_params default_params; - server_params default_sparams; - std::string arg; - bool invalid_param = false; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg == "--port") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.port = std::stoi(argv[i]); - } else if (arg == "--host") { - if (++i >= argc) { - invalid_param = true; - break; - } - 
sparams.hostname = argv[i]; - } else if (arg == "--timeout" || arg == "-to") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.read_timeout = std::stoi(argv[i]); - sparams.write_timeout = std::stoi(argv[i]); - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model = argv[i]; - } else if (arg == "-a" || arg == "--alias") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_alias = argv[i]; - } else if (arg == "-h" || arg == "--help") { - server_print_usage(argc, argv, default_params, default_sparams); - exit(0); - } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--memory-f32" || arg == "--memory_f32") { - params.memory_f16 = false; - } else if (arg == "--threads" || arg == "-t") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_batch = std::stoi(argv[i]); - params.n_batch = std::min(512, params.n_batch); - } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - break; - } + gpt_params default_params; + server_params default_sparams; + std::string arg; + bool invalid_param = false; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg == "--port") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.port = std::stoi(argv[i]); + } else if (arg == "--host") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.hostname = argv[i]; + } else if (arg == "--timeout" || arg == "-to") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.read_timeout = std::stoi(argv[i]); + sparams.write_timeout = std::stoi(argv[i]); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_alias = argv[i]; + } else if (arg == "-h" || arg == "--help") { + server_print_usage(argc, argv, default_params, default_sparams); + exit(0); + } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--memory-f32" || arg == "--memory_f32") { + params.memory_f16 = false; + } else if (arg == "--threads" || arg == "-t") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - params.n_gpu_layers = std::stoi(argv[i]); + params.n_gpu_layers = std::stoi(argv[i]); #else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be 
ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif - } - else if (arg == "--tensor-split" || arg == "-ts") - { - if (++i >= argc) - { - invalid_param = true; - break; } -#ifdef GGML_USE_CUBLAS - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) + else if (arg == "--tensor-split" || arg == "-ts") { - if (i < split_arg.size()) + if (++i >= argc) { - params.tensor_split[i] = std::stof(split_arg[i]); + invalid_param = true; + break; } - else +#ifdef GGML_USE_CUBLAS + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + + for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { - params.tensor_split[i] = 0.0f; + if (i < split_arg.size()) + { + params.tensor_split[i] = std::stof(split_arg[i]); + } + else + { + params.tensor_split[i] = 0.0f; + } } - } #else - fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); + fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); #endif // GGML_USE_CUBLAS - } - else if (arg == "--main-gpu" || arg == "-mg") - { - if (++i >= argc) - { - invalid_param = true; - break; } + else if (arg == "--main-gpu" || arg == "-mg") + { + if (++i >= argc) + { + invalid_param = true; + break; + } #ifdef GGML_USE_CUBLAS - params.main_gpu = std::stoi(argv[i]); + params.main_gpu = std::stoi(argv[i]); #else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. 
It is not possible to set a main GPU.\n"); #endif - } else if (arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter = argv[i]; - params.use_mmap = false; - } else if (arg == "--lora-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } else if (arg == "-v" || arg == "--verbose") { - sparams.verbose = true; - } else if (arg == "--mlock") { - params.use_mlock = true; - } else if (arg == "--no-mmap") { - params.use_mmap = false; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params, default_sparams); - exit(1); + } else if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter = argv[i]; + params.use_mmap = false; + } else if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; + } else if (arg == "-v" || arg == "--verbose") { + sparams.verbose = true; + } else if (arg == "--mlock") { + params.use_mlock = true; + } else if (arg == "--no-mmap") { + params.use_mmap = false; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + server_print_usage(argc, argv, default_params, default_sparams); + exit(1); + } } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params, default_sparams); - exit(1); - } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + server_print_usage(argc, argv, default_params, default_sparams); + exit(1); + } } -json format_generation_settings(llama_server_context &llama) { - const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); - const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && - eos_bias->second < 0.0f && std::isinf(eos_bias->second); - - return json { - { "seed", llama.params.seed }, - { "temp", llama.params.temp }, - { "top_k", llama.params.top_k }, - { "top_p", llama.params.top_p }, - { "tfs_z", llama.params.tfs_z }, - { "typical_p", llama.params.typical_p }, - { "repeat_last_n", llama.params.repeat_last_n }, - { "repeat_penalty", llama.params.repeat_penalty }, - { "presence_penalty", llama.params.presence_penalty }, - { "frequency_penalty", llama.params.frequency_penalty }, - { "mirostat", llama.params.mirostat }, - { "mirostat_tau", llama.params.mirostat_tau }, - { "mirostat_eta", llama.params.mirostat_eta }, - { "penalize_nl", llama.params.penalize_nl }, - { "stop", llama.params.antiprompt }, - { "n_predict", llama.params.n_predict }, - { "n_keep", llama.params.n_keep }, - { "ignore_eos", ignore_eos }, - { "stream", llama.stream }, - { "logit_bias", llama.params.logit_bias }, - }; +json format_generation_settings(llama_server_context& llama) { + const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); + const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && + eos_bias->second < 0.0f && std::isinf(eos_bias->second); + + return json{ + { "seed", llama.params.seed }, + { "temp", llama.params.temp }, + { "top_k", llama.params.top_k }, + { "top_p", llama.params.top_p }, + { "tfs_z", llama.params.tfs_z }, + { "typical_p", llama.params.typical_p }, + { "repeat_last_n", llama.params.repeat_last_n }, + { "repeat_penalty", llama.params.repeat_penalty }, + { "presence_penalty", llama.params.presence_penalty }, + { "frequency_penalty", llama.params.frequency_penalty }, + { "mirostat", 
llama.params.mirostat }, + { "mirostat_tau", llama.params.mirostat_tau }, + { "mirostat_eta", llama.params.mirostat_eta }, + { "penalize_nl", llama.params.penalize_nl }, + { "stop", llama.params.antiprompt }, + { "n_predict", llama.params.n_predict }, + { "n_keep", llama.params.n_keep }, + { "ignore_eos", ignore_eos }, + { "stream", llama.stream }, + { "logit_bias", llama.params.logit_bias }, + }; } -bool parse_options_completion(json body, llama_server_context& llama, Response &res) +bool parse_options_completion(json body, llama_server_context& llama, Response& res) { - gpt_params default_params; - if (!body["stream"].is_null()) { - llama.stream = body["stream"].get(); - } else { - llama.stream = false; - } - if (!body["n_predict"].is_null()) { - llama.params.n_predict = body["n_predict"].get(); - } else { - llama.params.n_predict = default_params.n_predict; - } - if (!body["top_k"].is_null()) { - llama.params.top_k = body["top_k"].get(); - } else { - llama.params.top_k = default_params.top_k; - } - if (!body["top_p"].is_null()) { - llama.params.top_p = body["top_p"].get(); - } else { - llama.params.top_p = default_params.top_p; - } - if (!body["tfs_z"].is_null()) { - llama.params.tfs_z = body["tfs_z"].get(); - } else { - llama.params.tfs_z = default_params.tfs_z; - } - if (!body["typical_p"].is_null()) { - llama.params.typical_p = body["typical_p"].get(); - } else { - llama.params.typical_p = default_params.typical_p; - } - if (!body["repeat_last_n"].is_null()) { - llama.params.repeat_last_n = body["repeat_last_n"].get(); - } else { - llama.params.repeat_last_n = default_params.repeat_last_n; - } - if (!body["temperature"].is_null()) { - llama.params.temp = body["temperature"].get(); - } else { - llama.params.temp = default_params.temp; - } - if (!body["repeat_penalty"].is_null()) { - llama.params.repeat_penalty = body["repeat_penalty"].get(); - } else { - llama.params.repeat_penalty = default_params.repeat_penalty; - } - if (!body["presence_penalty"].is_null()) { - llama.params.presence_penalty = body["presence_penalty"].get(); - } else { - llama.params.presence_penalty = default_params.presence_penalty; - } - if (!body["frequency_penalty"].is_null()) { - llama.params.frequency_penalty = body["frequency_penalty"].get(); - } else { - llama.params.frequency_penalty = default_params.frequency_penalty; - } - if (!body["mirostat"].is_null()) { - llama.params.mirostat = body["mirostat"].get(); - } else { - llama.params.mirostat = default_params.mirostat; - } - if (!body["mirostat_tau"].is_null()) { - llama.params.mirostat_tau = body["mirostat_tau"].get(); - } else { - llama.params.mirostat_tau = default_params.mirostat_tau; - } - if (!body["mirostat_eta"].is_null()) { - llama.params.mirostat_eta = body["mirostat_eta"].get(); - } else { - llama.params.mirostat_eta = default_params.mirostat_eta; - } - if (!body["penalize_nl"].is_null()) { - llama.params.penalize_nl = body["penalize_nl"].get(); - } else { - llama.params.penalize_nl = default_params.penalize_nl; - } - if (!body["n_keep"].is_null()) { - llama.params.n_keep = body["n_keep"].get(); - } else { - llama.params.n_keep = default_params.n_keep; - } - if (!body["seed"].is_null()) { - llama.params.seed = body["seed"].get(); - } else { - llama.params.seed = time(NULL); - } - - llama.params.logit_bias.clear(); - if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) { - llama.params.logit_bias[llama_token_eos()] = -INFINITY; - } - if (body["logit_bias"].is_array()) { - int n_vocab = llama_n_vocab(llama.ctx); - for (const auto &el : 
body["logit_bias"]) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - if (el[1].is_number_float()) { - llama.params.logit_bias[tok] = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - llama.params.logit_bias[tok] = -INFINITY; - } + gpt_params default_params; + if (!body["stream"].is_null()) { + llama.stream = body["stream"].get(); + } else { + llama.stream = false; + } + if (!body["n_predict"].is_null()) { + llama.params.n_predict = body["n_predict"].get(); + } else { + llama.params.n_predict = default_params.n_predict; + } + if (!body["top_k"].is_null()) { + llama.params.top_k = body["top_k"].get(); + } else { + llama.params.top_k = default_params.top_k; + } + if (!body["top_p"].is_null()) { + llama.params.top_p = body["top_p"].get(); + } else { + llama.params.top_p = default_params.top_p; + } + if (!body["tfs_z"].is_null()) { + llama.params.tfs_z = body["tfs_z"].get(); + } else { + llama.params.tfs_z = default_params.tfs_z; + } + if (!body["typical_p"].is_null()) { + llama.params.typical_p = body["typical_p"].get(); + } else { + llama.params.typical_p = default_params.typical_p; + } + if (!body["repeat_last_n"].is_null()) { + llama.params.repeat_last_n = body["repeat_last_n"].get(); + } else { + llama.params.repeat_last_n = default_params.repeat_last_n; + } + if (!body["temperature"].is_null()) { + llama.params.temp = body["temperature"].get(); + } else { + llama.params.temp = default_params.temp; + } + if (!body["repeat_penalty"].is_null()) { + llama.params.repeat_penalty = body["repeat_penalty"].get(); + } else { + llama.params.repeat_penalty = default_params.repeat_penalty; + } + if (!body["presence_penalty"].is_null()) { + llama.params.presence_penalty = body["presence_penalty"].get(); + } else { + llama.params.presence_penalty = default_params.presence_penalty; + } + if (!body["frequency_penalty"].is_null()) { + llama.params.frequency_penalty = body["frequency_penalty"].get(); + } else { + llama.params.frequency_penalty = default_params.frequency_penalty; + } + if (!body["mirostat"].is_null()) { + llama.params.mirostat = body["mirostat"].get(); + } else { + llama.params.mirostat = default_params.mirostat; + } + if (!body["mirostat_tau"].is_null()) { + llama.params.mirostat_tau = body["mirostat_tau"].get(); + } else { + llama.params.mirostat_tau = default_params.mirostat_tau; + } + if (!body["mirostat_eta"].is_null()) { + llama.params.mirostat_eta = body["mirostat_eta"].get(); + } else { + llama.params.mirostat_eta = default_params.mirostat_eta; + } + if (!body["penalize_nl"].is_null()) { + llama.params.penalize_nl = body["penalize_nl"].get(); + } else { + llama.params.penalize_nl = default_params.penalize_nl; + } + if (!body["n_keep"].is_null()) { + llama.params.n_keep = body["n_keep"].get(); + } else { + llama.params.n_keep = default_params.n_keep; + } + if (!body["seed"].is_null()) { + llama.params.seed = body["seed"].get(); + } else { + llama.params.seed = time(NULL); + } + + llama.params.logit_bias.clear(); + if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) { + llama.params.logit_bias[llama_token_eos()] = -INFINITY; + } + if (body["logit_bias"].is_array()) { + int n_vocab = llama_n_vocab(llama.ctx); + for (const auto& el : body["logit_bias"]) { + if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { + if (el[1].is_number_float()) { + llama.params.logit_bias[tok] = el[1].get(); + 
} else if (el[1].is_boolean() && !el[1].get()) { + llama.params.logit_bias[tok] = -INFINITY; + } + } + } } - } - } - } - - if (!body["prompt"].is_null()) { - llama.params.prompt = body["prompt"].get(); - } else { - json data = {{"status", "error"}, {"reason", "You need to provide a prompt"}}; - res.set_content(data.dump(llama.json_indent), "application/json"); - res.status = 400; - return false; - } - - llama.params.antiprompt.clear(); - if (!body["stop"].is_null()) { - const auto stop = body["stop"].get>(); - std::copy_if(stop.begin(), stop.end(), - std::back_inserter(llama.params.antiprompt), - [](const std::string &str) { return !str.empty(); }); - } - - if (llama.verbose) { - json tmp = format_generation_settings(llama); - fprintf(stderr, + } + + if (!body["prompt"].is_null()) { + llama.params.prompt = body["prompt"].get(); + } else { + json data = { {"status", "error"}, {"reason", "You need to provide a prompt"} }; + res.set_content(data.dump(llama.json_indent), "application/json"); + res.status = 400; + return false; + } + + llama.params.antiprompt.clear(); + if (!body["stop"].is_null()) { + const auto stop = body["stop"].get>(); + std::copy_if(stop.begin(), stop.end(), + std::back_inserter(llama.params.antiprompt), + [](const std::string& str) { return !str.empty(); }); + } + + if (llama.verbose) { + json tmp = format_generation_settings(llama); + fprintf(stderr, "-------------------------\n" "completion parameters: %s\n" "full prompt: \"%s\"\n", tmp.dump(4, ' ', false, json::error_handler_t::replace).c_str(), debug_str(llama.params.prompt).c_str()); - } + } - return true; + return true; } -int main(int argc, char **argv) +int main(int argc, char** argv) { - // own arguments required by this example - gpt_params params; - server_params sparams; - - // struct that contains llama context and inference - llama_server_context llama; - params.model = "ggml-model.bin"; - - server_params_parse(argc, argv, sparams, params); - - llama.verbose = sparams.verbose; - llama.json_indent = sparams.verbose ? 4 : -1; - - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } - - llama_init_backend(); - - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads, - std::thread::hardware_concurrency(), llama_print_system_info()); - - // load the model - if (!llama.loadModel(params)) - { - return 1; - } - - Server svr; - - svr.set_default_headers({ - {"Access-Control-Allow-Origin", "*"}, - {"Access-Control-Allow-Headers", "content-type"} - }); - - svr.Get("/", [](const Request &, Response &res) - { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); - - svr.Post("/completion", [&llama](const Request &req, Response &res) { - - llama.rewind(); - llama_reset_timings(llama.ctx); - - if (!parse_options_completion(json::parse(req.body), llama, res)) { - return; - } - - llama.loadPrompt(); - llama.beginCompletion(); - - if (!llama.stream) { - size_t stop_pos = std::string::npos; - - while (llama.has_next_token) { - const std::string token_text = llama.doCompletion(); - - stop_pos = llama.findStoppingStrings(llama.generated_text, - token_text.size(), STOP_FULL); - } - - if (stop_pos == std::string::npos) { - stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); - } - if (stop_pos != std::string::npos) { - llama.generated_text.erase(llama.generated_text.begin() + stop_pos, - llama.generated_text.end()); - } - - json data = {{"content", llama.generated_text}, - {"stop", true}, - {"model", llama.params.model_alias}, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word}}; - - llama_print_timings(llama.ctx); - - res.set_content( - data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), - "application/json"); - } else { - const auto chunked_content_provider = [&](size_t, DataSink &sink) { - size_t sent_count = 0; - - while (llama.has_next_token) { - const std::string token_text = llama.doCompletion(); - if (llama.multibyte_pending > 0) { - continue; - } - - size_t pos = std::min(sent_count, llama.generated_text.size()); - - const char *str_test = llama.generated_text.c_str() + pos; - size_t stop_pos = - llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); - if (stop_pos != std::string::npos) { - llama.generated_text.erase( - llama.generated_text.begin() + pos + stop_pos, - llama.generated_text.end()); - pos = std::min(sent_count, llama.generated_text.size()); - } else { - stop_pos = llama.findStoppingStrings(str_test, token_text.size(), - STOP_PARTIAL); - } - - std::string to_send = llama.generated_text.substr(pos, stop_pos); - sent_count += to_send.size(); - - json data; - if (llama.has_next_token) { - data = {{"content", to_send}, {"stop", false}}; - } else { - // Generation is done, send extra information. - data = { - {"content", to_send}, - {"stop", true}, - {"model", llama.params.model_alias}, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word}, - {"generated_text", llama.generated_text}}; - } - - std::string str = - "data: " + - data.dump(llama.has_next_token ? 
-1 : llama.json_indent, ' ', false, - json::error_handler_t::replace) + - "\n\n"; - - if (llama.verbose) { - fprintf(stderr, "to_send=%s", str.c_str()); - } - - if (!sink.write(str.data(), str.size())) { - if (llama.verbose) { - fprintf(stderr, "stream closed\n"); - } - llama_print_timings(llama.ctx); - return false; - } - } - - llama_print_timings(llama.ctx); - sink.done(); - return true; - }; - res.set_chunked_content_provider("text/event-stream", chunked_content_provider); - } - }); - - svr.Options(R"(/.*)", [](const Request &, Response &res) - { - return res.set_content("", "application/json"); - }); + // own arguments required by this example + gpt_params params; + server_params sparams; - svr.Post("/tokenize", [&llama](const Request &req, Response &res) - { - json body = json::parse(req.body); - json data = { - {"tokens", ::llama_tokenize(llama.ctx, body["content"].get(), false) } }; - return res.set_content(data.dump(llama.json_indent), "application/json"); - }); - - svr.set_logger([](const Request& req, const Response& res) { - json log = { - { "status", res.status }, - { "path", req.path }, - { "request", req.body }, - { "response", res.body }, - }; - fprintf(stdout, "http_request: %s\n", - log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); - }); - - svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { - const auto *fmt = "500 Internal Server Error\n%s"; - char buf[BUFSIZ]; - try { - std::rethrow_exception(std::move(ep)); - } catch (std::exception &e) { - snprintf(buf, sizeof(buf), fmt, e.what()); - } catch (...) { - snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); - } - res.set_content(buf, "text/plain"); - res.status = 500; - }); - - // set timeouts and change hostname and port - svr.set_read_timeout(sparams.read_timeout); - svr.set_write_timeout(sparams.write_timeout); - - if (!svr.bind_to_port(sparams.hostname, sparams.port)) { - fprintf(stderr, "%s: ERROR: couldn't bind server to %s:%i\n", __func__, - sparams.hostname.c_str(), sparams.port); - return 1; - } - - fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, - sparams.hostname.c_str(), sparams.port); - if (!svr.listen_after_bind()) { - return 1; - } - - return 0; + // struct that contains llama context and inference + llama_server_context llama; + params.model = "ggml-model.bin"; + + server_params_parse(argc, argv, sparams, params); + + llama.verbose = sparams.verbose; + llama.json_indent = sparams.verbose ? 4 : -1; + + if (params.model_alias == "unknown") { + params.model_alias = params.model; + } + + llama_init_backend(); + + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads, + std::thread::hardware_concurrency(), llama_print_system_info()); + + // load the model + if (!llama.loadModel(params)) + { + return 1; + } + + Server svr; + + svr.set_default_headers({ + {"Access-Control-Allow-Origin", "*"}, + {"Access-Control-Allow-Headers", "content-type"} + }); + + svr.Get("/", [](const Request&, Response& res) + { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); + + svr.Post("/completion", [&llama](const Request& req, Response& res) { + + llama.rewind(); + llama_reset_timings(llama.ctx); + + if (!parse_options_completion(json::parse(req.body), llama, res)) { + return; + } + + llama.loadPrompt(); + llama.beginCompletion(); + + if (!llama.stream) { + size_t stop_pos = std::string::npos; + + while (llama.has_next_token) { + const std::string token_text = llama.doCompletion(); + + stop_pos = llama.findStoppingStrings(llama.generated_text, + token_text.size(), STOP_FULL); + } + + if (stop_pos == std::string::npos) { + stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); + } + if (stop_pos != std::string::npos) { + llama.generated_text.erase(llama.generated_text.begin() + stop_pos, + llama.generated_text.end()); + } + + json data = { {"content", llama.generated_text}, + {"stop", true}, + {"model", llama.params.model_alias}, + {"tokens_predicted", llama.num_tokens_predicted}, + {"generation_settings", format_generation_settings(llama)}, + {"prompt", llama.params.prompt}, + {"stopping_word", llama.stopping_word} }; + + llama_print_timings(llama.ctx); + + res.set_content( + data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), + "application/json"); + } + else { + const auto chunked_content_provider = [&](size_t, DataSink& sink) { + size_t sent_count = 0; + + while (llama.has_next_token) { + const std::string token_text = llama.doCompletion(); + if (llama.multibyte_pending > 0) { + continue; + } + + size_t pos = std::min(sent_count, llama.generated_text.size()); + + const char* str_test = llama.generated_text.c_str() + pos; + size_t stop_pos = + llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); + if (stop_pos != std::string::npos) { + llama.generated_text.erase( + llama.generated_text.begin() + pos + stop_pos, + llama.generated_text.end()); + pos = std::min(sent_count, llama.generated_text.size()); + } else { + stop_pos = llama.findStoppingStrings(str_test, token_text.size(), + STOP_PARTIAL); + } + + std::string to_send = llama.generated_text.substr(pos, stop_pos); + sent_count += to_send.size(); + + json data; + if (llama.has_next_token) { + data = { {"content", to_send}, {"stop", false} }; + } else { + // Generation is done, send extra information. + data = { + {"content", to_send}, + {"stop", true}, + {"model", llama.params.model_alias}, + {"tokens_predicted", llama.num_tokens_predicted}, + {"generation_settings", format_generation_settings(llama)}, + {"prompt", llama.params.prompt}, + {"stopping_word", llama.stopping_word}, + {"generated_text", llama.generated_text} }; + } + + std::string str = + "data: " + + data.dump(llama.has_next_token ? 
-1 : llama.json_indent, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + + if (llama.verbose) { + fprintf(stderr, "to_send=%s", str.c_str()); + } + + if (!sink.write(str.data(), str.size())) { + if (llama.verbose) { + fprintf(stderr, "stream closed\n"); + } + llama_print_timings(llama.ctx); + return false; + } + } + + llama_print_timings(llama.ctx); + sink.done(); + return true; + }; + res.set_chunked_content_provider("text/event-stream", chunked_content_provider); + } + }); + + svr.Options(R"(/.*)", [](const Request&, Response& res) + { + return res.set_content("", "application/json"); + }); + + svr.Post("/tokenize", [&llama](const Request& req, Response& res) + { + json body = json::parse(req.body); + json data = { + {"tokens", ::llama_tokenize(llama.ctx, body["content"].get(), false) } }; + return res.set_content(data.dump(llama.json_indent), "application/json"); + }); + + svr.set_logger([](const Request& req, const Response& res) { + json log = { + { "status", res.status }, + { "path", req.path }, + { "request", req.body }, + { "response", res.body }, + }; + fprintf(stdout, "http_request: %s\n", + log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); + }); + + svr.set_exception_handler([](const Request&, Response& res, std::exception_ptr ep) { + const auto* fmt = "500 Internal Server Error\n%s"; + char buf[BUFSIZ]; + try { + std::rethrow_exception(std::move(ep)); + } + catch (std::exception& e) { + snprintf(buf, sizeof(buf), fmt, e.what()); + } + catch (...) { + snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); + } + res.set_content(buf, "text/plain"); + res.status = 500; + }); + + // set timeouts and change hostname and port + svr.set_read_timeout(sparams.read_timeout); + svr.set_write_timeout(sparams.write_timeout); + + if (!svr.bind_to_port(sparams.hostname, sparams.port)) { + fprintf(stderr, "%s: ERROR: couldn't bind server to %s:%i\n", __func__, + sparams.hostname.c_str(), sparams.port); + return 1; + } + + fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, + sparams.hostname.c_str(), sparams.port); + if (!svr.listen_after_bind()) { + return 1; + } + + return 0; } From cc2b33649d386d4d7b9cd6b83a4e9c874332d1dd Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Fri, 9 Jun 2023 04:50:31 -0400 Subject: [PATCH 093/121] Missed a pair of catch statements for formatting. --- examples/server/server.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2cd880685b61b..7e3b5fdf402de 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -951,11 +951,9 @@ int main(int argc, char** argv) char buf[BUFSIZ]; try { std::rethrow_exception(std::move(ep)); - } - catch (std::exception& e) { + } catch (std::exception& e) { snprintf(buf, sizeof(buf), fmt, e.what()); - } - catch (...) { + } catch (...) 
{ snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); } res.set_content(buf, "text/plain"); From 7580427837e74393a0bfb87ee1bf733ae59cdf93 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Fri, 9 Jun 2023 04:56:31 -0400 Subject: [PATCH 094/121] Resolving some review comments --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7e3b5fdf402de..46b17ed08078b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -16,7 +16,7 @@ struct server_params static size_t common_part(const std::vector& a, const std::vector& b) { size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++); + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} return i; } @@ -33,7 +33,7 @@ bool ends_with(const std::string& str, const std::string& suffix) size_t find_partial_stop_string(const std::string& stop, const std::string& text) { - if (!text.empty()) { + if (!text.empty() && !stop.empty()) { const char text_last_char = text.back(); for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { if (stop[char_index] == text_last_char) { From 7cdeb0848302bf12677264f0ec037b72e4fd67d9 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Fri, 9 Jun 2023 05:12:16 -0400 Subject: [PATCH 095/121] More formatting cleanup --- examples/server/server.cpp | 52 +++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 46b17ed08078b..afe0cc03cf189 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -14,7 +14,7 @@ struct server_params bool verbose = false; }; -static size_t common_part(const std::vector& a, const std::vector& b) { +static size_t common_part(const std::vector & a, const std::vector & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} return i; @@ -25,13 +25,13 @@ enum stop_type { STOP_PARTIAL, }; -bool ends_with(const std::string& str, const std::string& suffix) +bool ends_with(const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } -size_t find_partial_stop_string(const std::string& stop, const std::string& text) +size_t find_partial_stop_string(const std::string & stop, const std::string & text) { if (!text.empty() && !stop.empty()) { const char text_last_char = text.back(); @@ -47,7 +47,7 @@ size_t find_partial_stop_string(const std::string& stop, const std::string& text return std::string::npos; } -static std::string debug_str(const std::string& s) { +static std::string debug_str(const std::string & s) { std::string ret; for (size_t i = 0; s[i]; i++) { switch (s[i]) { @@ -60,7 +60,7 @@ static std::string debug_str(const std::string& s) { } template -static std::string tokens_to_str(llama_context* ctx, InputIt begin, OutputIt end) { +static std::string tokens_to_str(llama_context * ctx, InputIt begin, OutputIt end) { std::string ret; for (; begin != end; (void)++begin) { ret += llama_token_to_str(ctx, *begin); @@ -81,7 +81,7 @@ struct llama_server_context std::vector embd; std::vector last_n_tokens; - llama_context* ctx = nullptr; + llama_context * ctx = nullptr; gpt_params params; std::string stopping_word; @@ -110,7 +110,7 @@ struct llama_server_context n_past = 0; } - bool loadModel(const gpt_params& params_) + bool loadModel(const gpt_params & params_) { params = 
params_; ctx = llama_init_from_gpt_params(params); @@ -247,11 +247,11 @@ struct llama_server_context const bool penalize_nl = params.penalize_nl; llama_token id = 0; { - auto* logits = llama_get_logits(ctx); + auto * logits = llama_get_logits(ctx); auto n_vocab = llama_n_vocab(ctx); // Apply params.logit_bias map - for (const auto& it : params.logit_bias) { + for (const auto & it : params.logit_bias) { logits[it.first] += it.second; } @@ -327,11 +327,11 @@ struct llama_server_context return result; } - size_t findStoppingStrings(const std::string& text, const size_t last_token_size, + size_t findStoppingStrings(const std::string & text, const size_t last_token_size, const stop_type type) { size_t stop_pos = std::string::npos; - for (const std::string& word : params.antiprompt) { + for (const std::string & word : params.antiprompt) { size_t pos; if (type == STOP_FULL) { const size_t tmp = word.size() + last_token_size; @@ -405,7 +405,7 @@ using namespace httplib; using json = nlohmann::json; -void server_print_usage(int /*argc*/, char** argv, const gpt_params& params, const server_params& sparams) +void server_print_usage(int /*argc*/, char ** argv, const gpt_params & params, const server_params & sparams) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); @@ -445,8 +445,8 @@ void server_print_usage(int /*argc*/, char** argv, const gpt_params& params, con fprintf(stderr, "\n"); } -void server_params_parse(int argc, char** argv, server_params& sparams, - gpt_params& params) +void server_params_parse(int argc, char ** argv, server_params & sparams, + gpt_params & params) { gpt_params default_params; server_params default_sparams; @@ -598,7 +598,7 @@ void server_params_parse(int argc, char** argv, server_params& sparams, } } -json format_generation_settings(llama_server_context& llama) { +json format_generation_settings(llama_server_context & llama) { const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); @@ -627,7 +627,7 @@ json format_generation_settings(llama_server_context& llama) { }; } -bool parse_options_completion(json body, llama_server_context& llama, Response& res) +bool parse_options_completion(json body, llama_server_context & llama, Response & res) { gpt_params default_params; if (!body["stream"].is_null()) { @@ -722,7 +722,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response& } if (body["logit_bias"].is_array()) { int n_vocab = llama_n_vocab(llama.ctx); - for (const auto& el : body["logit_bias"]) { + for (const auto & el : body["logit_bias"]) { if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { llama_token tok = el[0].get(); if (tok >= 0 && tok < n_vocab) { @@ -750,7 +750,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response& const auto stop = body["stop"].get>(); std::copy_if(stop.begin(), stop.end(), std::back_inserter(llama.params.antiprompt), - [](const std::string& str) { return !str.empty(); }); + [](const std::string & str) { return !str.empty(); }); } if (llama.verbose) { @@ -766,7 +766,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response& return true; } -int main(int argc, char** argv) +int main(int argc, char ** argv) { // own arguments required by this example gpt_params params; @@ -804,10 +804,10 @@ int main(int argc, char** argv) {"Access-Control-Allow-Headers", "content-type"} }); - 
svr.Get("/", [](const Request&, Response& res) + svr.Get("/", [](const Request &, Response & res) { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); - svr.Post("/completion", [&llama](const Request& req, Response& res) { + svr.Post("/completion", [&llama](const Request & req, Response & res) { llama.rewind(); llama_reset_timings(llama.ctx); @@ -922,12 +922,12 @@ int main(int argc, char** argv) } }); - svr.Options(R"(/.*)", [](const Request&, Response& res) + svr.Options(R"(/.*)", [](const Request &, Response & res) { return res.set_content("", "application/json"); }); - svr.Post("/tokenize", [&llama](const Request& req, Response& res) + svr.Post("/tokenize", [&llama](const Request & req, Response & res) { json body = json::parse(req.body); json data = { @@ -935,7 +935,7 @@ int main(int argc, char** argv) return res.set_content(data.dump(llama.json_indent), "application/json"); }); - svr.set_logger([](const Request& req, const Response& res) { + svr.set_logger([](const Request & req, const Response & res) { json log = { { "status", res.status }, { "path", req.path }, @@ -946,8 +946,8 @@ int main(int argc, char** argv) log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); }); - svr.set_exception_handler([](const Request&, Response& res, std::exception_ptr ep) { - const auto* fmt = "500 Internal Server Error\n%s"; + svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) { + const auto * fmt = "500 Internal Server Error\n%s"; char buf[BUFSIZ]; try { std::rethrow_exception(std::move(ep)); From 1a9141b6c3101ce8a7f54a2466816fee6bdfa810 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Fri, 9 Jun 2023 16:29:10 -0400 Subject: [PATCH 096/121] Remove model assign in main(). Clarified stop in README. The model will now load the default from gptparams ("models/7B/ggml-model.bin") --- examples/server/README.md | 2 +- examples/server/server.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 364291ee3aac4..4ff4f5f6d9e9d 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -110,7 +110,7 @@ node . `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. - `stop`: Specify the strings that indicate a stop. + `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). `tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). diff --git a/examples/server/server.cpp b/examples/server/server.cpp index afe0cc03cf189..ee645c1430a88 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -774,7 +774,6 @@ int main(int argc, char ** argv) // struct that contains llama context and inference llama_server_context llama; - params.model = "ggml-model.bin"; server_params_parse(argc, argv, sparams, params); From 917540ce43ec7142d3edfdac53851941e4d257f3 Mon Sep 17 00:00:00 2001 From: Lesaun Harvey Date: Fri, 9 Jun 2023 19:06:09 -0700 Subject: [PATCH 097/121] Clarify build instructions in README. --- examples/server/README.md | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 4ff4f5f6d9e9d..56399be291c6c 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -21,9 +21,26 @@ Command line options: - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. - `--port`: Set the port to listen. Default: `8080`. 
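For reference, a minimal sketch of a `/completion` request exercising the `stop` array described above (the URL assumes the default `127.0.0.1:8080` host and port; the prompt text and stop strings are placeholders only):

```bash
# Hypothetical example request; "prompt", "n_predict" and "stop" are JSON fields
# read by parse_options_completion() in server.cpp.
curl --request POST \
  --url http://127.0.0.1:8080/completion \
  --header "Content-Type: application/json" \
  --data '{
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 64,
    "stop": ["</s>", "User:"]
  }'
```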
-## Quick Start +## Build + +Build llama.cpp with server from repository root with either make or CMake. + +- Using `make`: + + ```bash + LLAMA_BUILD_SERVER=1 make + ``` -**Note:** The server is not built by default. Make sure to add `LLAMA_BUILD_SERVER=ON` to your CMake command. +- Using `CMake`: + + ```bash + mkdir build-server + cd build-server + cmake -DLLAMA_BUILD_SERVER=ON .. + cmake --build . --config Release + ``` + +## Quick Start To get started right away, run the following command, making sure to use the correct path for the model you have: From 2c00bf855da5b5c84f5ee6baf4749c442ff33713 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 11 Jun 2023 14:01:42 +0300 Subject: [PATCH 098/121] more formatting changes --- examples/server/server.cpp | 223 ++++++++++++++++--------------------- 1 file changed, 99 insertions(+), 124 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ee645c1430a88..209db251568f9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,8 +5,7 @@ #include "httplib.h" #include "json.hpp" -struct server_params -{ +struct server_params { std::string hostname = "127.0.0.1"; int32_t port = 8080; int32_t read_timeout = 600; @@ -25,14 +24,12 @@ enum stop_type { STOP_PARTIAL, }; -bool ends_with(const std::string & str, const std::string & suffix) -{ +bool ends_with(const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } -size_t find_partial_stop_string(const std::string & stop, const std::string & text) -{ +size_t find_partial_stop_string(const std::string & stop, const std::string & text) { if (!text.empty() && !stop.empty()) { const char text_last_char = text.back(); for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { @@ -59,8 +56,8 @@ static std::string debug_str(const std::string & s) { return ret; } -template -static std::string tokens_to_str(llama_context * ctx, InputIt begin, OutputIt end) { +template +static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; (void)++begin) { ret += llama_token_to_str(ctx, *begin); @@ -68,8 +65,7 @@ static std::string tokens_to_str(llama_context * ctx, InputIt begin, OutputIt en return ret; } -struct llama_server_context -{ +struct llama_server_context { bool stream = false; bool has_next_token = false; std::string generated_text = ""; @@ -90,8 +86,7 @@ struct llama_server_context int json_indent = -1; int32_t multibyte_pending = 0; - ~llama_server_context() - { + ~llama_server_context() { if (ctx) { llama_free(ctx); ctx = nullptr; @@ -110,12 +105,10 @@ struct llama_server_context n_past = 0; } - bool loadModel(const gpt_params & params_) - { + bool loadModel(const gpt_params & params_) { params = params_; ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) - { + if (ctx == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return false; } @@ -184,8 +177,7 @@ struct llama_server_context has_next_token = true; } - void beginCompletion() - { + void beginCompletion() { // number of tokens to keep when resetting context n_remain = params.n_predict; llama_set_rng_seed(ctx, params.seed); @@ -215,15 +207,12 @@ struct llama_server_context } } - while (n_past < embd.size()) - { + while (n_past < embd.size()) { int n_eval = (int)embd.size() - n_past; - if (n_eval > params.n_batch) - { + if (n_eval > params.n_batch) { n_eval = params.n_batch; } - if 
(llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) - { + if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); has_next_token = false; return result; @@ -245,8 +234,7 @@ struct llama_server_context const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; const bool penalize_nl = params.penalize_nl; - llama_token id = 0; - { + llama_token id = 0; { auto * logits = llama_get_logits(ctx); auto n_vocab = llama_n_vocab(ctx); @@ -257,8 +245,7 @@ struct llama_server_context std::vector candidates; candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) - { + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); } @@ -273,18 +260,15 @@ struct llama_server_context llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) - { + if (!penalize_nl) { logits[llama_token_nl()] = nl_logit; } - if (temp <= 0) - { + if (temp <= 0) { // Greedy sampling id = llama_sample_token_greedy(ctx, &candidates_p); } else { - if (mirostat == 1) - { + if (mirostat == 1) { static float mirostat_mu = 2.0f * mirostat_tau; const int mirostat_m = 100; llama_sample_temperature(ctx, &candidates_p, temp); @@ -328,8 +312,7 @@ struct llama_server_context } size_t findStoppingStrings(const std::string & text, const size_t last_token_size, - const stop_type type) - { + const stop_type type) { size_t stop_pos = std::string::npos; for (const std::string & word : params.antiprompt) { size_t pos; @@ -353,8 +336,7 @@ struct llama_server_context return stop_pos; } - std::string doCompletion() - { + std::string doCompletion() { llama_token token = nextToken(); std::string token_text = token == -1 ? 
"" : llama_token_to_str(ctx, token); @@ -405,8 +387,7 @@ using namespace httplib; using json = nlohmann::json; -void server_print_usage(int /*argc*/, char ** argv, const gpt_params & params, const server_params & sparams) -{ +void server_print_usage(int /*argc*/, char ** argv, const gpt_params & params, const server_params & sparams) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); @@ -417,12 +398,10 @@ void server_print_usage(int /*argc*/, char ** argv, const gpt_params & params, c fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); - if (llama_mlock_supported()) - { + if (llama_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } - if (llama_mmap_supported()) - { + if (llama_mmap_supported()) { fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD @@ -446,8 +425,7 @@ void server_print_usage(int /*argc*/, char ** argv, const gpt_params & params, c } void server_params_parse(int argc, char ** argv, server_params & sparams, - gpt_params & params) -{ + gpt_params & params) { gpt_params default_params; server_params default_sparams; std::string arg; @@ -522,10 +500,8 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif } - else if (arg == "--tensor-split" || arg == "-ts") - { - if (++i >= argc) - { + else if (arg == "--tensor-split" || arg == "-ts") { + if (++i >= argc) { invalid_param = true; break; } @@ -538,14 +514,11 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, std::vector split_arg{ it, {} }; GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) - { - if (i < split_arg.size()) - { + for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { + if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); } - else - { + else { params.tensor_split[i] = 0.0f; } } @@ -553,10 +526,8 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. 
It is not possible to set a tensor split.\n"); #endif // GGML_USE_CUBLAS } - else if (arg == "--main-gpu" || arg == "-mg") - { - if (++i >= argc) - { + else if (arg == "--main-gpu" || arg == "-mg") { + if (++i >= argc) { invalid_param = true; break; } @@ -603,32 +574,31 @@ json format_generation_settings(llama_server_context & llama) { const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); - return json{ - { "seed", llama.params.seed }, - { "temp", llama.params.temp }, - { "top_k", llama.params.top_k }, - { "top_p", llama.params.top_p }, - { "tfs_z", llama.params.tfs_z }, - { "typical_p", llama.params.typical_p }, - { "repeat_last_n", llama.params.repeat_last_n }, - { "repeat_penalty", llama.params.repeat_penalty }, - { "presence_penalty", llama.params.presence_penalty }, - { "frequency_penalty", llama.params.frequency_penalty }, - { "mirostat", llama.params.mirostat }, - { "mirostat_tau", llama.params.mirostat_tau }, - { "mirostat_eta", llama.params.mirostat_eta }, - { "penalize_nl", llama.params.penalize_nl }, - { "stop", llama.params.antiprompt }, - { "n_predict", llama.params.n_predict }, - { "n_keep", llama.params.n_keep }, - { "ignore_eos", ignore_eos }, - { "stream", llama.stream }, - { "logit_bias", llama.params.logit_bias }, + return json { + { "seed", llama.params.seed }, + { "temp", llama.params.temp }, + { "top_k", llama.params.top_k }, + { "top_p", llama.params.top_p }, + { "tfs_z", llama.params.tfs_z }, + { "typical_p", llama.params.typical_p }, + { "repeat_last_n", llama.params.repeat_last_n }, + { "repeat_penalty", llama.params.repeat_penalty }, + { "presence_penalty", llama.params.presence_penalty }, + { "frequency_penalty", llama.params.frequency_penalty }, + { "mirostat", llama.params.mirostat }, + { "mirostat_tau", llama.params.mirostat_tau }, + { "mirostat_eta", llama.params.mirostat_eta }, + { "penalize_nl", llama.params.penalize_nl }, + { "stop", llama.params.antiprompt }, + { "n_predict", llama.params.n_predict }, + { "n_keep", llama.params.n_keep }, + { "ignore_eos", ignore_eos }, + { "stream", llama.stream }, + { "logit_bias", llama.params.logit_bias }, }; } -bool parse_options_completion(json body, llama_server_context & llama, Response & res) -{ +bool parse_options_completion(json body, llama_server_context & llama, Response & res) { gpt_params default_params; if (!body["stream"].is_null()) { llama.stream = body["stream"].get(); @@ -766,8 +736,7 @@ bool parse_options_completion(json body, llama_server_context & llama, Response return true; } -int main(int argc, char ** argv) -{ +int main(int argc, char ** argv) { // own arguments required by this example gpt_params params; server_params sparams; @@ -791,20 +760,20 @@ int main(int argc, char ** argv) std::thread::hardware_concurrency(), llama_print_system_info()); // load the model - if (!llama.loadModel(params)) - { + if (!llama.loadModel(params)) { return 1; } Server svr; svr.set_default_headers({ - {"Access-Control-Allow-Origin", "*"}, - {"Access-Control-Allow-Headers", "content-type"} - }); + { "Access-Control-Allow-Origin", "*" }, + { "Access-Control-Allow-Headers", "content-type" } + }); - svr.Get("/", [](const Request &, Response & res) - { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); + svr.Get("/", [](const Request &, Response & res) { + res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); + }); svr.Post("/completion", [&llama](const Request & req, Response & res) { @@ -836,13 +805,15 @@ int main(int argc, char ** argv) llama.generated_text.end()); } - json data = { {"content", llama.generated_text}, - {"stop", true}, - {"model", llama.params.model_alias}, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word} }; + json data { + { "content", llama.generated_text }, + { "stop", true }, + { "model", llama.params.model_alias }, + { "tokens_predicted", llama.num_tokens_predicted }, + { "generation_settings", format_generation_settings(llama) }, + { "prompt", llama.params.prompt }, + { "stopping_word", llama.stopping_word }, + }; llama_print_timings(llama.ctx); @@ -851,7 +822,7 @@ int main(int argc, char ** argv) "application/json"); } else { - const auto chunked_content_provider = [&](size_t, DataSink& sink) { + const auto chunked_content_provider = [&](size_t, DataSink & sink) { size_t sent_count = 0; while (llama.has_next_token) { @@ -880,18 +851,22 @@ int main(int argc, char ** argv) json data; if (llama.has_next_token) { - data = { {"content", to_send}, {"stop", false} }; + data = { + { "content", to_send }, + { "stop", false }, + }; } else { // Generation is done, send extra information. data = { - {"content", to_send}, - {"stop", true}, - {"model", llama.params.model_alias}, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word}, - {"generated_text", llama.generated_text} }; + { "content", to_send }, + { "stop", true }, + { "model", llama.params.model_alias }, + { "tokens_predicted", llama.num_tokens_predicted }, + { "generation_settings", format_generation_settings(llama) }, + { "prompt", llama.params.prompt }, + { "stopping_word", llama.stopping_word }, + { "generated_text", llama.generated_text }, + }; } std::string str = @@ -919,31 +894,31 @@ int main(int argc, char ** argv) }; res.set_chunked_content_provider("text/event-stream", chunked_content_provider); } - }); + }); - svr.Options(R"(/.*)", [](const Request &, Response & res) - { - return res.set_content("", "application/json"); - }); + svr.Options(R"(/.*)", [](const Request &, Response & res) { + return res.set_content("", "application/json"); + }); - svr.Post("/tokenize", [&llama](const Request & req, Response & res) - { - json body = json::parse(req.body); - json data = { - {"tokens", ::llama_tokenize(llama.ctx, body["content"].get(), false) } }; - return res.set_content(data.dump(llama.json_indent), "application/json"); - }); + svr.Post("/tokenize", [&llama](const Request & req, Response & res) { + json body = json::parse(req.body); + std::string content = body["content"].get(); + std::vector tokens = ::llama_tokenize(llama.ctx, content, false); + json data {{ "tokens", tokens }}; + return res.set_content(data.dump(llama.json_indent), "application/json"); + }); svr.set_logger([](const Request & req, const Response & res) { json log = { + { "time", time(NULL) }, + { "ip", req.remote_addr }, { "status", res.status }, { "path", req.path }, { "request", req.body }, { "response", res.body }, }; - fprintf(stdout, "http_request: %s\n", - log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); - }); + fprintf(stdout, "%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); + }); 
svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) { const auto * fmt = "500 Internal Server Error\n%s"; From 9612d12fbf981e3c5c2e1ac58bfdefae952e0c23 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 11 Jun 2023 16:18:39 +0300 Subject: [PATCH 099/121] big logging update --- examples/server/server.cpp | 216 +++++++++++++++++++------------------ 1 file changed, 111 insertions(+), 105 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 209db251568f9..be3b0e65dfc17 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,9 @@ #include "httplib.h" #include "json.hpp" +using namespace httplib; +using json = nlohmann::json; + struct server_params { std::string hostname = "127.0.0.1"; int32_t port = 8080; @@ -44,18 +47,6 @@ size_t find_partial_stop_string(const std::string & stop, const std::string & te return std::string::npos; } -static std::string debug_str(const std::string & s) { - std::string ret; - for (size_t i = 0; s[i]; i++) { - switch (s[i]) { - case '\n': ret += "\\n"; break; - case '"': ret += "\\\""; break; - default: ret += s[i]; break; - } - } - return ret; -} - template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; @@ -65,6 +56,36 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { return ret; } +static void server_log(const char * level, const char * function, int line, const char * message, nlohmann::ordered_json extra) { + nlohmann::ordered_json log { + { "timestamp", time(NULL) }, + { "level", level }, + { "function", function }, + { "line", line }, + { "message", message }, + }; + + if (!extra.empty()) { + log.merge_patch(extra); + } + + std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); + fprintf(stdout, "%.*s\n", (int)str.size(), str.data()); +} + +static bool server_verbose = false; + +#define LOG_VERBOSE(MSG, ...) \ + do { \ + if (server_verbose) { \ + server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ + } \ + } while(0) + +#define LOG_ERROR(MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO(MSG, ...) 
server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + struct llama_server_context { bool stream = false; bool has_next_token = false; @@ -82,7 +103,6 @@ struct llama_server_context { std::string stopping_word; - bool verbose = false; int json_indent = -1; int32_t multibyte_pending = 0; @@ -109,7 +129,7 @@ struct llama_server_context { params = params_; ctx = llama_init_from_gpt_params(params); if (ctx == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_ERROR("unable to load model", { { "model", params_.model } }); return false; } @@ -135,17 +155,12 @@ struct llama_server_context { new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); - if (verbose) { - fprintf(stderr, - "input truncated: {\n" - " n_ctx: %d,\n" - " n_keep: %d,\n" - " n_left: %d,\n" - " new_tokens: \"%s\",\n" - "}\n", - params.n_ctx, params.n_keep, n_left, - debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); - } + LOG_VERBOSE("input truncated", { + { "n_ctx", params.n_ctx }, + { "n_keep", params.n_keep }, + { "n_left", n_left }, + { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, + }); prompt_tokens = new_tokens; } else { @@ -162,17 +177,11 @@ struct llama_server_context { n_past--; } - if (verbose) { - fprintf(stderr, - "prompt: {\n" - " n_past: %zu,\n" - " cached: \"%s\",\n" - " to_eval: \"%s\",\n" - "}\n", - n_past, - debug_str(tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)).c_str(), - debug_str(tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())).c_str()); - } + LOG_VERBOSE("prompt ingested", { + { "n_past", n_past }, + { "cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past) }, + { "to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) }, + }); has_next_token = true; } @@ -194,16 +203,13 @@ struct llama_server_context { new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); embd = new_tokens; n_past = params.n_keep; - if (verbose) { - fprintf(stderr, - "input truncated: {\n" - " n_ctx: %d,\n" - " n_keep: %d,\n" - " n_left: %d,\n" - " new_tokens: \"%s\",\n" - "}\n", - params.n_ctx, params.n_keep, n_left, - debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); + if (server_verbose) { + LOG_VERBOSE("input truncated", { + { "n_ctx", params.n_ctx }, + { "n_keep", params.n_keep }, + { "n_left", n_left }, + { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, + }); } } @@ -213,7 +219,12 @@ struct llama_server_context { n_eval = params.n_batch; } if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERROR("failed to eval", { + { "n_eval", n_eval }, + { "n_past", n_past }, + { "n_threads", params.n_threads }, + { "embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) }, + }); has_next_token = false; return result; } @@ -301,9 +312,7 @@ struct llama_server_context { if (!embd.empty() && embd.back() == llama_token_eos()) { stopping_word = llama_token_to_str(ctx, embd.back()); has_next_token = false; - if (verbose) { - fprintf(stderr, "eos token found!\n"); - } + LOG_VERBOSE("eos token found", {}); return result; } @@ -365,34 +374,27 @@ struct llama_server_context { n_remain++; } - if (verbose) { - fprintf(stderr, - "next token: {\n" - " token: %d,\n" - " token_text: \"%s\",\n" - " 
has_next_token: %d,\n" - " n_remain: %ld,\n" - " num_tokens_predicted: %ld,\n" - " stopping_word: \"%s\",\n" - "}\n", - token, debug_str(llama_token_to_str(ctx, token)).c_str(), has_next_token, n_remain, num_tokens_predicted, - debug_str(stopping_word).c_str()); + if (server_verbose) { + LOG_VERBOSE("next token", { + { "token", token }, + { "token_text", llama_token_to_str(ctx, token) }, + { "has_next_token", has_next_token }, + { "n_remain", n_remain }, + { "num_tokens_predicted", num_tokens_predicted }, + { "stopping_word", stopping_word }, + }); } return token_text; } }; -using namespace httplib; - -using json = nlohmann::json; - -void server_print_usage(int /*argc*/, char ** argv, const gpt_params & params, const server_params & sparams) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); +void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) { + fprintf(stderr, "usage: %s [options]\n", argv0); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -v, --verbose verbose output (default: false)\n"); + fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", sparams.verbose ? "enabled" : "disabled"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); @@ -465,7 +467,7 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, } params.model_alias = argv[i]; } else if (arg == "-h" || arg == "--help") { - server_print_usage(argc, argv, default_params, default_sparams); + server_print_usage(argv[0], default_params, default_sparams); exit(0); } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { if (++i >= argc) { @@ -496,8 +498,8 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers = std::stoi(argv[i]); #else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. " + "See main README.md for information on enabling GPU BLAS support", { { "n_gpu_layers", params.n_gpu_layers } }); #endif } else if (arg == "--tensor-split" || arg == "-ts") { @@ -523,7 +525,7 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, } } #else - fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); + LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {}); #endif // GGML_USE_CUBLAS } else if (arg == "--main-gpu" || arg == "-mg") { @@ -534,7 +536,7 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, #ifdef GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); #else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); + LOG_WARNING("llama.cpp was compiled without cuBLAS. 
It is not possible to set a main GPU.", {}); #endif } else if (arg == "--lora") { if (++i >= argc) { @@ -551,20 +553,23 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, params.lora_base = argv[i]; } else if (arg == "-v" || arg == "--verbose") { sparams.verbose = true; +#ifndef SERVER_VERBOSE + LOG_WARNING("server.cpp is not built with verbose logging.", {}); +#endif } else if (arg == "--mlock") { params.use_mlock = true; } else if (arg == "--no-mmap") { params.use_mmap = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params, default_sparams); + server_print_usage(argv[0], default_params, default_sparams); exit(1); } } if (invalid_param) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params, default_sparams); + server_print_usage(argv[0], default_params, default_sparams); exit(1); } } @@ -723,15 +728,7 @@ bool parse_options_completion(json body, llama_server_context & llama, Response [](const std::string & str) { return !str.empty(); }); } - if (llama.verbose) { - json tmp = format_generation_settings(llama); - fprintf(stderr, - "-------------------------\n" - "completion parameters: %s\n" - "full prompt: \"%s\"\n", - tmp.dump(4, ' ', false, json::error_handler_t::replace).c_str(), - debug_str(llama.params.prompt).c_str()); - } + LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama)); return true; } @@ -746,8 +743,9 @@ int main(int argc, char ** argv) { server_params_parse(argc, argv, sparams, params); - llama.verbose = sparams.verbose; - llama.json_indent = sparams.verbose ? 4 : -1; +#ifdef SERVER_VERBOSE + server_verbose = sparams.verbose; +#endif if (params.model_alias == "unknown") { params.model_alias = params.model; @@ -755,9 +753,15 @@ int main(int argc, char ** argv) { llama_init_backend(); - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads, - std::thread::hardware_concurrency(), llama_print_system_info()); + LOG_INFO("build info", { + { "build", BUILD_NUMBER }, + { "commit", BUILD_COMMIT } + }); + LOG_INFO("system info", { + { "n_threads", params.n_threads }, + { "total_threads", std::thread::hardware_concurrency() }, + { "system_info", llama_print_system_info() }, + }); // load the model if (!llama.loadModel(params)) { @@ -875,14 +879,12 @@ int main(int argc, char ** argv) { json::error_handler_t::replace) + "\n\n"; - if (llama.verbose) { - fprintf(stderr, "to_send=%s", str.c_str()); - } + LOG_VERBOSE("data stream", { + { "to_send", str } + }); if (!sink.write(str.data(), str.size())) { - if (llama.verbose) { - fprintf(stderr, "stream closed\n"); - } + LOG_VERBOSE("stream closed", {}); llama_print_timings(llama.ctx); return false; } @@ -909,15 +911,14 @@ int main(int argc, char ** argv) { }); svr.set_logger([](const Request & req, const Response & res) { - json log = { - { "time", time(NULL) }, - { "ip", req.remote_addr }, + LOG_INFO("request", { + { "remote_addr", req.remote_addr }, + { "remote_port", req.remote_port }, { "status", res.status }, { "path", req.path }, { "request", req.body }, { "response", res.body }, - }; - fprintf(stdout, "%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); + }); }); svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) { @@ -939,13 +940,18 @@ int main(int argc, char ** argv) { 
svr.set_write_timeout(sparams.write_timeout); if (!svr.bind_to_port(sparams.hostname, sparams.port)) { - fprintf(stderr, "%s: ERROR: couldn't bind server to %s:%i\n", __func__, - sparams.hostname.c_str(), sparams.port); + LOG_ERROR("couldn't bind to server socket", { + { "hostname", sparams.hostname }, + { "port", sparams.port }, + }); return 1; } - fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, - sparams.hostname.c_str(), sparams.port); + LOG_INFO("HTTP server listening", { + { "hostname", sparams.hostname }, + { "port", sparams.port }, + }); + if (!svr.listen_after_bind()) { return 1; } From 6518f9c4822065d520576964c362b747f6577aec Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sun, 11 Jun 2023 16:32:53 +0300 Subject: [PATCH 100/121] build settings --- .gitignore | 1 + Makefile | 2 ++ examples/server/CMakeLists.txt | 3 +++ examples/server/server.cpp | 8 ++++++-- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 9b6905ed4ef0c..2635a230036c6 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ models/* /embedding /benchmark-matmult /vdot +/server /Pipfile /libllama.so diff --git a/Makefile b/Makefile index 39ebfd04825da..66039d0fcde0c 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,8 @@ BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server + LLAMA_SERVER_VERBOSE ?= 1 +server: CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) endif default: $(BUILD_TARGETS) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 74126c687ffd3..1d992501b7e55 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,4 +1,5 @@ set(TARGET server) +option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp json.hpp httplib.h) target_compile_definitions(${TARGET} PRIVATE @@ -8,6 +9,8 @@ target_compile_definitions(${TARGET} PRIVATE $<$: CPPHTTPLIB_NO_EXCEPTIONS=1 > + + SERVER_VERBOSE=$ ) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index be3b0e65dfc17..46a3d636a15ab 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,10 @@ #include "httplib.h" #include "json.hpp" +#ifndef SERVER_VERBOSE +#define SERVER_VERBOSE 1 +#endif + using namespace httplib; using json = nlohmann::json; @@ -553,7 +557,7 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, params.lora_base = argv[i]; } else if (arg == "-v" || arg == "--verbose") { sparams.verbose = true; -#ifndef SERVER_VERBOSE +#if SERVER_VERBOSE != 1 LOG_WARNING("server.cpp is not built with verbose logging.", {}); #endif } else if (arg == "--mlock") { @@ -743,7 +747,7 @@ int main(int argc, char ** argv) { server_params_parse(argc, argv, sparams, params); -#ifdef SERVER_VERBOSE +#if SERVER_VERBOSE == 1 server_verbose = sparams.verbose; #endif From 4148b9bd037e2f3647a73e215ea52fef0f384d26 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 12 Jun 2023 10:28:17 +0300 Subject: [PATCH 101/121] remove void --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 46a3d636a15ab..908c648ea0841 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp 
@@ -54,7 +54,7 @@ size_t find_partial_stop_string(const std::string & stop, const std::string & te template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; - for (; begin != end; (void)++begin) { + for (; begin != end; ++begin) { ret += llama_token_to_str(ctx, *begin); } return ret; From dff11a14d23b6bcd09c75430d35e29fa32da54c8 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 12 Jun 2023 16:52:21 +0300 Subject: [PATCH 102/121] json parsing improvements --- examples/server/server.cpp | 120 +++++++------------------------------ 1 file changed, 23 insertions(+), 97 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 908c648ea0841..24fffbc14b2e9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -607,98 +607,33 @@ json format_generation_settings(llama_server_context & llama) { }; } -bool parse_options_completion(json body, llama_server_context & llama, Response & res) { +bool parse_options_completion(json body, llama_server_context & llama) { gpt_params default_params; - if (!body["stream"].is_null()) { - llama.stream = body["stream"].get(); - } else { - llama.stream = false; - } - if (!body["n_predict"].is_null()) { - llama.params.n_predict = body["n_predict"].get(); - } else { - llama.params.n_predict = default_params.n_predict; - } - if (!body["top_k"].is_null()) { - llama.params.top_k = body["top_k"].get(); - } else { - llama.params.top_k = default_params.top_k; - } - if (!body["top_p"].is_null()) { - llama.params.top_p = body["top_p"].get(); - } else { - llama.params.top_p = default_params.top_p; - } - if (!body["tfs_z"].is_null()) { - llama.params.tfs_z = body["tfs_z"].get(); - } else { - llama.params.tfs_z = default_params.tfs_z; - } - if (!body["typical_p"].is_null()) { - llama.params.typical_p = body["typical_p"].get(); - } else { - llama.params.typical_p = default_params.typical_p; - } - if (!body["repeat_last_n"].is_null()) { - llama.params.repeat_last_n = body["repeat_last_n"].get(); - } else { - llama.params.repeat_last_n = default_params.repeat_last_n; - } - if (!body["temperature"].is_null()) { - llama.params.temp = body["temperature"].get(); - } else { - llama.params.temp = default_params.temp; - } - if (!body["repeat_penalty"].is_null()) { - llama.params.repeat_penalty = body["repeat_penalty"].get(); - } else { - llama.params.repeat_penalty = default_params.repeat_penalty; - } - if (!body["presence_penalty"].is_null()) { - llama.params.presence_penalty = body["presence_penalty"].get(); - } else { - llama.params.presence_penalty = default_params.presence_penalty; - } - if (!body["frequency_penalty"].is_null()) { - llama.params.frequency_penalty = body["frequency_penalty"].get(); - } else { - llama.params.frequency_penalty = default_params.frequency_penalty; - } - if (!body["mirostat"].is_null()) { - llama.params.mirostat = body["mirostat"].get(); - } else { - llama.params.mirostat = default_params.mirostat; - } - if (!body["mirostat_tau"].is_null()) { - llama.params.mirostat_tau = body["mirostat_tau"].get(); - } else { - llama.params.mirostat_tau = default_params.mirostat_tau; - } - if (!body["mirostat_eta"].is_null()) { - llama.params.mirostat_eta = body["mirostat_eta"].get(); - } else { - llama.params.mirostat_eta = default_params.mirostat_eta; - } - if (!body["penalize_nl"].is_null()) { - llama.params.penalize_nl = body["penalize_nl"].get(); - } else { - llama.params.penalize_nl = default_params.penalize_nl; - } - if (!body["n_keep"].is_null()) { - 
llama.params.n_keep = body["n_keep"].get(); - } else { - llama.params.n_keep = default_params.n_keep; - } - if (!body["seed"].is_null()) { - llama.params.seed = body["seed"].get(); - } else { - llama.params.seed = time(NULL); - } + + llama.stream = body.value("stream", false); + llama.params.n_predict = body.value("n_predict", default_params.n_predict); + llama.params.top_k = body.value("top_k", default_params.top_k); + llama.params.top_p = body.value("top_p", default_params.top_p); + llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z); + llama.params.typical_p = body.value("typical_p", default_params.typical_p); + llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n); + llama.params.temp = body.value("temperature", default_params.temp); + llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty); + llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty); + llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty); + llama.params.mirostat = body.value("mirostat", default_params.mirostat); + llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau); + llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta); + llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl); + llama.params.n_keep = body.value("n_keep", default_params.n_keep); + llama.params.seed = body.value("seed", default_params.seed); + llama.params.prompt = body.value("prompt", default_params.prompt); llama.params.logit_bias.clear(); - if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) { + if (body.value("ignore_eos", false)) { llama.params.logit_bias[llama_token_eos()] = -INFINITY; } + if (body["logit_bias"].is_array()) { int n_vocab = llama_n_vocab(llama.ctx); for (const auto & el : body["logit_bias"]) { @@ -715,15 +650,6 @@ bool parse_options_completion(json body, llama_server_context & llama, Response } } - if (!body["prompt"].is_null()) { - llama.params.prompt = body["prompt"].get(); - } else { - json data = { {"status", "error"}, {"reason", "You need to provide a prompt"} }; - res.set_content(data.dump(llama.json_indent), "application/json"); - res.status = 400; - return false; - } - llama.params.antiprompt.clear(); if (!body["stop"].is_null()) { const auto stop = body["stop"].get>(); @@ -788,7 +714,7 @@ int main(int argc, char ** argv) { llama.rewind(); llama_reset_timings(llama.ctx); - if (!parse_options_completion(json::parse(req.body), llama, res)) { + if (!parse_options_completion(json::parse(req.body), llama)) { return; } From 13cf6929b7c979b83a755f529fb14434734ce58a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 12 Jun 2023 17:29:25 +0300 Subject: [PATCH 103/121] more json changes and stop info --- examples/server/server.cpp | 132 ++++++++++++++++++++++--------------- 1 file changed, 79 insertions(+), 53 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 24fffbc14b2e9..8c02dd9773401 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -105,6 +105,10 @@ struct llama_server_context { llama_context * ctx = nullptr; gpt_params params; + bool truncated = false; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; std::string stopping_word; int json_indent = -1; @@ -122,6 +126,10 @@ struct llama_server_context { num_tokens_predicted = 0; generated_text = ""; 
generated_text.reserve(params.n_ctx); + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; stopping_word = ""; multibyte_pending = 0; @@ -166,6 +174,7 @@ struct llama_server_context { { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, }); + truncated = true; prompt_tokens = new_tokens; } else { const size_t ps = prompt_tokens.size(); @@ -207,14 +216,13 @@ struct llama_server_context { new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); embd = new_tokens; n_past = params.n_keep; - if (server_verbose) { - LOG_VERBOSE("input truncated", { - { "n_ctx", params.n_ctx }, - { "n_keep", params.n_keep }, - { "n_left", n_left }, - { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, - }); - } + truncated = true; + LOG_VERBOSE("input truncated", { + { "n_ctx", params.n_ctx }, + { "n_keep", params.n_keep }, + { "n_left", n_left }, + { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, + }); } while (n_past < embd.size()) { @@ -314,8 +322,9 @@ struct llama_server_context { --n_remain; if (!embd.empty() && embd.back() == llama_token_eos()) { - stopping_word = llama_token_to_str(ctx, embd.back()); + //stopping_word = llama_token_to_str(ctx, embd.back()); has_next_token = false; + stopped_eos = true; LOG_VERBOSE("eos token found", {}); return result; } @@ -341,6 +350,7 @@ struct llama_server_context { (stop_pos == std::string::npos || pos < stop_pos)) { if (type == STOP_FULL) { stopping_word = word; + stopped_word = true; has_next_token = false; } stop_pos = pos; @@ -378,17 +388,22 @@ struct llama_server_context { n_remain++; } - if (server_verbose) { - LOG_VERBOSE("next token", { - { "token", token }, - { "token_text", llama_token_to_str(ctx, token) }, - { "has_next_token", has_next_token }, - { "n_remain", n_remain }, - { "num_tokens_predicted", num_tokens_predicted }, - { "stopping_word", stopping_word }, - }); + if (!has_next_token && n_remain == 0) { + stopped_limit = true; } + LOG_VERBOSE("next token", { + { "token", token }, + { "token_text", llama_token_to_str(ctx, token) }, + { "has_next_token", has_next_token }, + { "n_remain", n_remain }, + { "num_tokens_predicted", num_tokens_predicted }, + { "stopped_eos", stopped_eos }, + { "stopped_word", stopped_word }, + { "stopped_limit", stopped_limit }, + { "stopping_word", stopping_word }, + }); + return token_text; } }; @@ -578,7 +593,7 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, } } -json format_generation_settings(llama_server_context & llama) { +static json format_generation_settings(llama_server_context & llama) { const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); @@ -607,6 +622,35 @@ json format_generation_settings(llama_server_context & llama) { }; } +static json format_final_response(llama_server_context & llama, const std::string & content) { + return json { + { "content", content }, + { "stop", true }, + { "model", llama.params.model_alias }, + { "tokens_predicted", llama.num_tokens_predicted }, + { "generation_settings", format_generation_settings(llama) }, + { "prompt", llama.params.prompt }, + { "truncated", llama.truncated }, + { "stopped_eos", llama.stopped_eos }, + { "stopped_word", llama.stopped_word }, + { "stopped_limit", llama.stopped_limit }, + { "stopping_word", llama.stopping_word }, + }; +} + +static json 
format_partial_response(const std::string & content) { + return json { + { "content", content }, + { "stop", false }, + }; +} + +static json format_tokenizer_response(const std::vector & tokens) { + return json { + { "tokens", tokens } + }; +} + bool parse_options_completion(json body, llama_server_context & llama) { gpt_params default_params; @@ -663,6 +707,17 @@ bool parse_options_completion(json body, llama_server_context & llama) { return true; } +static void log_server_request(const Request & req, const Response & res) { + LOG_INFO("request", { + { "remote_addr", req.remote_addr }, + { "remote_port", req.remote_port }, + { "status", res.status }, + { "path", req.path }, + { "request", req.body }, + { "response", res.body }, + }); +} + int main(int argc, char ** argv) { // own arguments required by this example gpt_params params; @@ -739,15 +794,7 @@ int main(int argc, char ** argv) { llama.generated_text.end()); } - json data { - { "content", llama.generated_text }, - { "stop", true }, - { "model", llama.params.model_alias }, - { "tokens_predicted", llama.num_tokens_predicted }, - { "generation_settings", format_generation_settings(llama) }, - { "prompt", llama.params.prompt }, - { "stopping_word", llama.stopping_word }, - }; + json data = format_final_response(llama, llama.generated_text); llama_print_timings(llama.ctx); @@ -785,22 +832,10 @@ int main(int argc, char ** argv) { json data; if (llama.has_next_token) { - data = { - { "content", to_send }, - { "stop", false }, - }; + data = format_partial_response(to_send); } else { // Generation is done, send extra information. - data = { - { "content", to_send }, - { "stop", true }, - { "model", llama.params.model_alias }, - { "tokens_predicted", llama.num_tokens_predicted }, - { "generation_settings", format_generation_settings(llama) }, - { "prompt", llama.params.prompt }, - { "stopping_word", llama.stopping_word }, - { "generated_text", llama.generated_text }, - }; + data = format_final_response(llama, to_send); } std::string str = @@ -836,20 +871,11 @@ int main(int argc, char ** argv) { json body = json::parse(req.body); std::string content = body["content"].get(); std::vector tokens = ::llama_tokenize(llama.ctx, content, false); - json data {{ "tokens", tokens }}; + json data = format_tokenizer_response(tokens); return res.set_content(data.dump(llama.json_indent), "application/json"); }); - svr.set_logger([](const Request & req, const Response & res) { - LOG_INFO("request", { - { "remote_addr", req.remote_addr }, - { "remote_port", req.remote_port }, - { "status", res.status }, - { "path", req.path }, - { "request", req.body }, - { "response", res.body }, - }); - }); + svr.set_logger(log_server_request); svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) { const auto * fmt = "500 Internal Server Error\n%s"; From b91200a2e5a4b74cdfd27a7af2cb4d5875d316a9 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 12 Jun 2023 18:34:01 +0300 Subject: [PATCH 104/121] javascript chat update. --- examples/server/README.md | 2 +- examples/server/chat.mjs | 112 +++++++++++++++++++++++--------------- 2 files changed, 70 insertions(+), 44 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 56399be291c6c..8b53dc36e97d2 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -167,7 +167,7 @@ node . ### Interactive mode Check the sample in [chat.mjs](chat.mjs). 
-Run with node: +Run with NodeJS version 16 or later: ```sh node chat.mjs diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs index 349937e940a08..34ff90343183b 100644 --- a/examples/server/chat.mjs +++ b/examples/server/chat.mjs @@ -1,61 +1,87 @@ -import * as readline from 'node:readline/promises'; -import { stdin as input, stdout as output } from 'node:process'; +import * as readline from 'node:readline' +import { stdin, stdout } from 'node:process' const chat = [ - { human: "Hello, Assistant.", - assistant: "Hello. How may I help you today?" }, - { human: "Please tell me the largest city in Europe.", - assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia." }, + { + human: "Hello, Assistant.", + assistant: "Hello. How may I help you today?" + }, + { + human: "Please tell me the largest city in Europe.", + assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia." + }, ] +const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.` + function format_prompt(question) { - return "A chat between a curious human and an artificial intelligence assistant. " - + "The assistant gives helpful, detailed, and polite answers to the human's questions.\n" - + chat.map(m => `### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n") - + `\n### Human: ${question}\n### Assistant:` + return `${instruction}\n${ + chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n") + }\n### Human: ${question}\n### Assistant:` } -async function ChatCompletion(question) { - const result = await fetch("http://127.0.0.1:8080/completion", { - method: 'POST', - body: JSON.stringify({ - prompt: format_prompt(question), - temperature: 0.2, - top_k: 40, - top_p: 0.9, - n_keep: 29, - n_predict: 256, - stop: ["\n### Human:"], // stop completion after generating this - stream: true, +async function tokenize(content) { + const result = await fetch("http://127.0.0.1:8080/tokenize", { + method: 'POST', + body: JSON.stringify({ content }) }) - }) - - if (!result.ok) { - return; - } - let answer = '' - - for await (var chunk of result.body) { - const t = Buffer.from(chunk).toString('utf8') - if (t.startsWith('data: ')) { - const message = JSON.parse(t.substring(6)) - answer += message.content - process.stdout.write(message.content) - if (message.stop) break; + if (!result.ok) { + return [] } - } - process.stdout.write('\n') - chat.push({ human: question, assistant: answer }) + return await result.json().tokens } -const rl = readline.createInterface({ input, output }); +const n_keep = await tokenize(instruction).length -while(true) { +async function chat_completion(question) { + const result = await fetch("http://127.0.0.1:8080/completion", { + method: 'POST', + body: JSON.stringify({ + prompt: format_prompt(question), + temperature: 0.2, + top_k: 40, + top_p: 0.9, + n_keep: n_keep, + n_predict: 256, + stop: ["\n### Human:"], // stop completion after generating this + stream: true, + }) + }) + + if (!result.ok) { + return + } - const question = await rl.question('> ') - await ChatCompletion(question); + let answer = '' + for await (var chunk of result.body) { + const t = Buffer.from(chunk).toString('utf8') + if (t.startsWith('data: ')) { + const message = JSON.parse(t.substring(6)) + answer += message.content + process.stdout.write(message.content) + if (message.stop) { + if (message.truncated) { + chat.shift() + } + break + } + } + } 
+ + process.stdout.write('\n') + chat.push({ human: question, assistant: answer.trimStart() }) } +const rl = readline.createInterface({ input: stdin, output: stdout }); + +const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => { + rl.question(query, options, resolve) +}); + +while(true) { + const question = await readlineQuestion(rl, '> ') + await chat_completion(question) +} From 15103379012a8ff3031fb53ee9e7b57628abdae8 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 12 Jun 2023 18:34:12 +0300 Subject: [PATCH 105/121] fix make flags propagation --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 66039d0fcde0c..4b870bd47aabe 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server LLAMA_SERVER_VERBOSE ?= 1 -server: CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) +server: private CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) endif default: $(BUILD_TARGETS) From fc4264d14a0b8d25d1d0c4c03e4b9565942f37e9 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 12 Jun 2023 18:43:40 +0300 Subject: [PATCH 106/121] api url --- examples/server/chat.mjs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs index 34ff90343183b..8269e2592733b 100644 --- a/examples/server/chat.mjs +++ b/examples/server/chat.mjs @@ -1,6 +1,8 @@ import * as readline from 'node:readline' import { stdin, stdout } from 'node:process' +const API_URL = 'http://127.0.0.1:8080' + const chat = [ { human: "Hello, Assistant.", @@ -21,7 +23,7 @@ function format_prompt(question) { } async function tokenize(content) { - const result = await fetch("http://127.0.0.1:8080/tokenize", { + const result = await fetch(`${API_URL}/tokenize`, { method: 'POST', body: JSON.stringify({ content }) }) @@ -36,7 +38,7 @@ async function tokenize(content) { const n_keep = await tokenize(instruction).length async function chat_completion(question) { - const result = await fetch("http://127.0.0.1:8080/completion", { + const result = await fetch(`${API_URL}/completion`, { method: 'POST', body: JSON.stringify({ prompt: format_prompt(question), From 28694f7ac99f02d836defd44121f36ee4c74f72e Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 12 Jun 2023 19:53:13 +0300 Subject: [PATCH 107/121] add a simple bash script too --- examples/server/README.md | 8 ++++++ examples/server/chat.sh | 52 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 examples/server/chat.sh diff --git a/examples/server/README.md b/examples/server/README.md index 8b53dc36e97d2..824fd5194273e 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -172,3 +172,11 @@ Run with NodeJS version 16 or later: ```sh node chat.mjs ``` + +Another sample in [chat.sh](chat.sh). +Requires [bash](https://www.gnu.org/software/bash/), [curl](https://curl.se) and [jq](https://jqlang.github.io/jq/). +Run with bash: + +```sh +bash chat.sh +``` diff --git a/examples/server/chat.sh b/examples/server/chat.sh new file mode 100644 index 0000000000000..dd2d1953b06fd --- /dev/null +++ b/examples/server/chat.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +API_URL="http://127.0.0.1:8080" + +CHAT=( + "Hello, Assistant." + "Hello. How may I help you today?" + "Please tell me the largest city in Europe." + "Sure. The largest city in Europe is Moscow, the capital of Russia." 
+) + +INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + +format_prompt() { + echo -n "${INSTRUCTION}" + printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1" +} + +tokenize() { + echo -n "$1" | jq -Rs '{content:.}' | curl \ + --silent \ + --request POST \ + --url "${API_URL}/tokenize" \ + --data "@-" | jq '.tokens[]' +} + +N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l) + +chat_completion() { + CONTENT=$(format_prompt "$1" | jq -Rs --argjson n_keep $N_KEEP '{ + prompt: ., + temperature: 0.2, + top_k: 40, + top_p: 0.9, + n_keep: $n_keep, + n_predict: 256, + stop: ["\n### Human:"] + }' | curl \ + --silent \ + --request POST \ + --url "${API_URL}/completion" \ + --data "@-" | jq -r '.content | sub("^\\s*"; "")') + + printf "$CONTENT\n" + + CHAT+=("$1" "$CONTENT") +} + +while true; do + read -p "> " QUESTION + chat_completion "${QUESTION}" +done From 429ed950af285fb0675a7dcfe04e90b884cb4632 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 12 Jun 2023 20:46:53 +0300 Subject: [PATCH 108/121] move CPPHTTPLIB settings inside server Since they aren't configurable and were missing from the Makefile. --- examples/server/CMakeLists.txt | 7 ------- examples/server/server.cpp | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 1d992501b7e55..07ba76ad35bbd 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -3,13 +3,6 @@ option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp json.hpp httplib.h) target_compile_definitions(${TARGET} PRIVATE - # single thread - CPPHTTPLIB_THREAD_POOL_COUNT=1 - # crash the server in debug mode, otherwise send an http 500 error - $<$: - CPPHTTPLIB_NO_EXCEPTIONS=1 - > - SERVER_VERBOSE=$ ) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8c02dd9773401..2fd64d543c2f7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2,6 +2,13 @@ #include "llama.h" #include "build-info.h" +// single thread +#define CPPHTTPLIB_THREAD_POOL_COUNT 1 +#ifndef NDEBUG +// crash the server in debug mode, otherwise send an http 500 error +#define CPPHTTPLIB_NO_EXCEPTIONS 1 +#endif + #include "httplib.h" #include "json.hpp" From f344d090f76d3208f687d8aaa8d07fbcd1d84985 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 12 Jun 2023 22:49:08 +0300 Subject: [PATCH 109/121] streaming shell script --- examples/server/chat.sh | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/examples/server/chat.sh b/examples/server/chat.sh index dd2d1953b06fd..2429da6652256 100644 --- a/examples/server/chat.sh +++ b/examples/server/chat.sh @@ -17,36 +17,49 @@ format_prompt() { } tokenize() { - echo -n "$1" | jq -Rs '{content:.}' | curl \ + curl \ --silent \ --request POST \ --url "${API_URL}/tokenize" \ - --data "@-" | jq '.tokens[]' + --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \ + | jq '.tokens[]' } N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l) chat_completion() { - CONTENT=$(format_prompt "$1" | jq -Rs --argjson n_keep $N_KEEP '{ + DATA="$(format_prompt "$1" | jq -Rs --argjson n_keep $N_KEEP '{ prompt: ., temperature: 0.2, top_k: 40, top_p: 0.9, n_keep: $n_keep, 
n_predict: 256, - stop: ["\n### Human:"] - }' | curl \ + stop: ["\n### Human:"], + stream: true + }')" + + ANSWER='' + + curl \ --silent \ + --no-buffer \ --request POST \ --url "${API_URL}/completion" \ - --data "@-" | jq -r '.content | sub("^\\s*"; "")') + --data-raw "${DATA}" | while IFS= read -r LINE; do + if [[ $LINE = data:* ]]; then + CONTENT="$(echo "${LINE:5}" | jq -r '.content')" + printf "%s" "${CONTENT}" + ANSWER+="${CONTENT}" + fi + done - printf "$CONTENT\n" + printf "\n" - CHAT+=("$1" "$CONTENT") + CHAT+=("$1" "${ANSWER:1}") } while true; do - read -p "> " QUESTION + read -e -p "> " QUESTION chat_completion "${QUESTION}" done From 6d72f0f070e2a7f8e98e040aac5f5325c3616df6 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Mon, 12 Jun 2023 19:44:53 -0400 Subject: [PATCH 110/121] Make chat shell script work by piping the content out of the subshell. --- examples/server/chat.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/server/chat.sh b/examples/server/chat.sh index 2429da6652256..ddd4418bd01be 100644 --- a/examples/server/chat.sh +++ b/examples/server/chat.sh @@ -41,24 +41,25 @@ chat_completion() { ANSWER='' - curl \ - --silent \ - --no-buffer \ - --request POST \ - --url "${API_URL}/completion" \ - --data-raw "${DATA}" | while IFS= read -r LINE; do + while IFS= read -r LINE; do if [[ $LINE = data:* ]]; then CONTENT="$(echo "${LINE:5}" | jq -r '.content')" printf "%s" "${CONTENT}" ANSWER+="${CONTENT}" fi - done + done < <(curl \ + --silent \ + --no-buffer \ + --request POST \ + --url "${API_URL}/completion" \ + --data-raw "${DATA}") printf "\n" CHAT+=("$1" "${ANSWER:1}") } + while true; do read -e -p "> " QUESTION chat_completion "${QUESTION}" From 9d564db9ae105df85a11081e2e843cf354f211ce Mon Sep 17 00:00:00 2001 From: anon Date: Mon, 12 Jun 2023 21:30:33 -0300 Subject: [PATCH 111/121] trim response and trim trailing space in prompt Also add "-r" to read because of this: https://www.shellcheck.net/wiki/SC2162 --- examples/server/chat.sh | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/examples/server/chat.sh b/examples/server/chat.sh index ddd4418bd01be..e2b50fb4415d9 100644 --- a/examples/server/chat.sh +++ b/examples/server/chat.sh @@ -11,6 +11,17 @@ CHAT=( INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." 
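The earlier switch from a plain pipeline to process substitution is what lets `ANSWER` accumulate across the streaming loop: a pipeline runs the `while` body in a subshell, so its variable assignments are discarded. The following is a self-contained sketch of that difference, not part of the patch itself:

```sh
# A pipeline runs the loop in a subshell, so the assignments are lost.
ANSWER=''
printf 'a\nb\n' | while IFS= read -r LINE; do ANSWER+="$LINE"; done
echo "after pipeline: '${ANSWER}'"              # after pipeline: ''

# Process substitution keeps the loop in the current shell, so they survive.
ANSWER=''
while IFS= read -r LINE; do ANSWER+="$LINE"; done < <(printf 'a\nb\n')
echo "after process substitution: '${ANSWER}'"  # after process substitution: 'ab'
```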
+trim() { + shopt -s extglob + set -- "${1##+([[:space:]])}" + printf "%s" "${1%%+([[:space:]])}" +} + +trim_trailing() { + shopt -s extglob + printf "%s" "${1%%+([[:space:]])}" +} + format_prompt() { echo -n "${INSTRUCTION}" printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1" @@ -28,7 +39,8 @@ tokenize() { N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l) chat_completion() { - DATA="$(format_prompt "$1" | jq -Rs --argjson n_keep $N_KEEP '{ + PROMPT="$(trim_trailing "$(format_prompt "$1")")" + DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{ prompt: ., temperature: 0.2, top_k: 40, @@ -56,11 +68,10 @@ chat_completion() { printf "\n" - CHAT+=("$1" "${ANSWER:1}") + CHAT+=("$1" "$(trim "$ANSWER")") } - while true; do - read -e -p "> " QUESTION + read -r -e -p "> " QUESTION chat_completion "${QUESTION}" done From b8b8a6ed007c1cd4c5f36a581389b08b2b723531 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 13 Jun 2023 12:58:02 +0300 Subject: [PATCH 112/121] Add log flush --- examples/server/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2fd64d543c2f7..d677cdd1a02da 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -82,6 +82,7 @@ static void server_log(const char * level, const char * function, int line, cons std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); fprintf(stdout, "%.*s\n", (int)str.size(), str.data()); + fflush(stdout); } static bool server_verbose = false; From 6627a0254004d87fcff0207586b0ad0757c8a557 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 13 Jun 2023 13:36:31 +0300 Subject: [PATCH 113/121] Allow overriding the server address --- examples/server/chat.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/chat.sh b/examples/server/chat.sh index e2b50fb4415d9..a89f8e908c403 100644 --- a/examples/server/chat.sh +++ b/examples/server/chat.sh @@ -1,6 +1,6 @@ #!/bin/bash -API_URL="http://127.0.0.1:8080" +API_URL="${API_URL:-http://127.0.0.1:8080}" CHAT=( "Hello, Assistant." From 1f3945236a6208f5e816e81390d82c413e0a9dfc Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 13 Jun 2023 14:14:29 -0300 Subject: [PATCH 114/121] remove old verbose variable And expand macro to nothing when verbose is disabled with compilation flags. --- examples/server/server.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d677cdd1a02da..13df0c5397963 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -24,7 +24,6 @@ struct server_params { int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; - bool verbose = false; }; static size_t common_part(const std::vector & a, const std::vector & b) { @@ -87,12 +86,16 @@ static void server_log(const char * level, const char * function, int line, cons static bool server_verbose = false; -#define LOG_VERBOSE(MSG, ...) \ +#if SERVER_VERBOSE != 1 +# define LOG_VERBOSE(MSG, ...) +#else +# define LOG_VERBOSE(MSG, ...) \ do { \ if (server_verbose) { \ server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ } \ } while(0) +#endif #define LOG_ERROR(MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_WARNING(MSG, ...) 
server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) @@ -421,7 +424,7 @@ void server_print_usage(const char * argv0, const gpt_params & params, const ser fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", sparams.verbose ? "enabled" : "disabled"); + fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); @@ -579,9 +582,10 @@ void server_params_parse(int argc, char ** argv, server_params & sparams, } params.lora_base = argv[i]; } else if (arg == "-v" || arg == "--verbose") { - sparams.verbose = true; #if SERVER_VERBOSE != 1 LOG_WARNING("server.cpp is not built with verbose logging.", {}); +#else + server_verbose = true; #endif } else if (arg == "--mlock") { params.use_mlock = true; @@ -736,10 +740,6 @@ int main(int argc, char ** argv) { server_params_parse(argc, argv, sparams, params); -#if SERVER_VERBOSE == 1 - server_verbose = sparams.verbose; -#endif - if (params.model_alias == "unknown") { params.model_alias = params.model; } From 99ef967d42b240275eb2c0de5f654163edcdde30 Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 13 Jun 2023 14:17:22 -0300 Subject: [PATCH 115/121] add static prefix to the other functions too --- examples/server/server.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 13df0c5397963..42eac19833333 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -37,12 +37,13 @@ enum stop_type { STOP_PARTIAL, }; -bool ends_with(const std::string & str, const std::string & suffix) { +static bool ends_with(const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } -size_t find_partial_stop_string(const std::string & stop, const std::string & text) { +static size_t find_partial_stop_string(const std::string & stop, + const std::string & text) { if (!text.empty() && !stop.empty()) { const char text_last_char = text.back(); for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { @@ -419,7 +420,8 @@ struct llama_server_context { } }; -void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) { +static void server_print_usage(const char * argv0, const gpt_params & params, + const server_params & sparams) { fprintf(stderr, "usage: %s [options]\n", argv0); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); @@ -456,8 +458,8 @@ void server_print_usage(const char * argv0, const gpt_params & params, const ser fprintf(stderr, "\n"); } -void server_params_parse(int argc, char ** argv, server_params & sparams, - gpt_params & params) { +static void server_params_parse(int argc, char ** argv, server_params & sparams, + gpt_params & params) { gpt_params default_params; server_params default_sparams; std::string arg; From 575cf23862ec818bdc6c72f5e0b7eaa64cb0ae24 Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 13 Jun 2023 14:21:40 -0300 Subject: [PATCH 116/121] remove json_indent variable --- 
examples/server/server.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 42eac19833333..1e1255fa545a3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -122,8 +122,6 @@ struct llama_server_context { bool stopped_word = false; bool stopped_limit = false; std::string stopping_word; - - int json_indent = -1; int32_t multibyte_pending = 0; ~llama_server_context() { @@ -808,11 +806,9 @@ int main(int argc, char ** argv) { llama_print_timings(llama.ctx); - res.set_content( - data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), - "application/json"); - } - else { + res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), + "application/json"); + } else { const auto chunked_content_provider = [&](size_t, DataSink & sink) { size_t sent_count = 0; @@ -850,8 +846,7 @@ int main(int argc, char ** argv) { std::string str = "data: " + - data.dump(llama.has_next_token ? -1 : llama.json_indent, ' ', false, - json::error_handler_t::replace) + + data.dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n"; LOG_VERBOSE("data stream", { @@ -882,7 +877,7 @@ int main(int argc, char ** argv) { std::string content = body["content"].get(); std::vector tokens = ::llama_tokenize(llama.ctx, content, false); json data = format_tokenizer_response(tokens); - return res.set_content(data.dump(llama.json_indent), "application/json"); + return res.set_content(data.dump(), "application/json"); }); svr.set_logger(log_server_request); From 7df316b728fb1ebd37c31d06aad3872591b9faf8 Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 13 Jun 2023 14:28:52 -0300 Subject: [PATCH 117/121] fix linter warnings + make variables const --- examples/server/server.cpp | 71 ++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1e1255fa545a3..cb95291695aa1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -67,9 +67,10 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { return ret; } -static void server_log(const char * level, const char * function, int line, const char * message, nlohmann::ordered_json extra) { +static void server_log(const char * level, const char * function, int line, + const char * message, const nlohmann::ordered_json & extra) { nlohmann::ordered_json log { - { "timestamp", time(NULL) }, + { "timestamp", time(nullptr) }, { "level", level }, { "function", function }, { "line", line }, @@ -80,7 +81,7 @@ static void server_log(const char * level, const char * function, int line, cons log.merge_patch(extra); } - std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); + const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); fprintf(stdout, "%.*s\n", (int)str.size(), str.data()); fflush(stdout); } @@ -105,7 +106,7 @@ static bool server_verbose = false; struct llama_server_context { bool stream = false; bool has_next_token = false; - std::string generated_text = ""; + std::string generated_text; size_t num_tokens_predicted = 0; size_t n_past = 0; @@ -150,7 +151,7 @@ struct llama_server_context { bool loadModel(const gpt_params & params_) { params = params_; ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + if (ctx == nullptr) { LOG_ERROR("unable to load model", { { "model", params_.model } }); return false; } @@ -267,7 +268,9 @@ struct 
llama_server_context { const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; const bool penalize_nl = params.penalize_nl; - llama_token id = 0; { + llama_token id = 0; + + { auto * logits = llama_get_logits(ctx); auto n_vocab = llama_n_vocab(ctx); @@ -344,7 +347,7 @@ struct llama_server_context { } size_t findStoppingStrings(const std::string & text, const size_t last_token_size, - const stop_type type) { + const stop_type type) { size_t stop_pos = std::string::npos; for (const std::string & word : params.antiprompt) { size_t pos; @@ -370,9 +373,9 @@ struct llama_server_context { } std::string doCompletion() { - llama_token token = nextToken(); + const llama_token token = nextToken(); - std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token); + const std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token); generated_text += token_text; if (multibyte_pending > 0) { @@ -546,12 +549,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, std::vector split_arg{ it, {} }; GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); + for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) { + if (i_device < split_arg.size()) { + params.tensor_split[i_device] = std::stof(split_arg[i_device]); } else { - params.tensor_split[i] = 0.0f; + params.tensor_split[i_device] = 0.0f; } } #else @@ -663,7 +666,7 @@ static json format_tokenizer_response(const std::vector & tokens) { }; } -bool parse_options_completion(json body, llama_server_context & llama) { +static void parse_options_completion(const json & body, llama_server_context & llama) { gpt_params default_params; llama.stream = body.value("stream", false); @@ -691,7 +694,7 @@ bool parse_options_completion(json body, llama_server_context & llama) { } if (body["logit_bias"].is_array()) { - int n_vocab = llama_n_vocab(llama.ctx); + const int n_vocab = llama_n_vocab(llama.ctx); for (const auto & el : body["logit_bias"]) { if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { llama_token tok = el[0].get(); @@ -715,8 +718,6 @@ bool parse_options_completion(json body, llama_server_context & llama) { } LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama)); - - return true; } static void log_server_request(const Request & req, const Response & res) { @@ -773,13 +774,10 @@ int main(int argc, char ** argv) { }); svr.Post("/completion", [&llama](const Request & req, Response & res) { - llama.rewind(); llama_reset_timings(llama.ctx); - if (!parse_options_completion(json::parse(req.body), llama)) { - return; - } + parse_options_completion(json::parse(req.body), llama); llama.loadPrompt(); llama.beginCompletion(); @@ -802,7 +800,7 @@ int main(int argc, char ** argv) { llama.generated_text.end()); } - json data = format_final_response(llama, llama.generated_text); + const json data = format_final_response(llama, llama.generated_text); llama_print_timings(llama.ctx); @@ -820,7 +818,7 @@ int main(int argc, char ** argv) { size_t pos = std::min(sent_count, llama.generated_text.size()); - const char* str_test = llama.generated_text.c_str() + pos; + const std::string str_test = llama.generated_text.substr(pos); size_t stop_pos = llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); if (stop_pos != std::string::npos) { @@ -833,18 +831,15 @@ int main(int argc, char ** argv) { 
                                                             STOP_PARTIAL);
                }

-               std::string to_send = llama.generated_text.substr(pos, stop_pos);
+               const std::string to_send = llama.generated_text.substr(pos, stop_pos);
                sent_count += to_send.size();

-               json data;
-               if (llama.has_next_token) {
-                   data = format_partial_response(to_send);
-               } else {
-                   // Generation is done, send extra information.
-                   data = format_final_response(llama, to_send);
-               }
+               const json data = llama.has_next_token
+                                     ? format_partial_response(to_send)
+                                     // Generation is done, send extra information.
+                                     : format_final_response(llama, to_send);

-               std::string str =
+               const std::string str =
                    "data: " +
                    data.dump(-1, ' ', false, json::error_handler_t::replace) +
                    "\n\n";

                LOG_VERBOSE("data stream", {
@@ -873,10 +868,10 @@ int main(int argc, char ** argv) {
    });

    svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
-       json body = json::parse(req.body);
-       std::string content = body["content"].get<std::string>();
-       std::vector<llama_token> tokens = ::llama_tokenize(llama.ctx, content, false);
-       json data = format_tokenizer_response(tokens);
+       const json body = json::parse(req.body);
+       const std::string content = body["content"].get<std::string>();
+       const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+       const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json");
    });

@@ -887,14 +882,14 @@ int main(int argc, char ** argv) {
        char buf[BUFSIZ];
        try {
            std::rethrow_exception(std::move(ep));
-       } catch (std::exception& e) {
+       } catch (std::exception & e) {
            snprintf(buf, sizeof(buf), fmt, e.what());
        } catch (...) {
            snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
        }
        res.set_content(buf, "text/plain");
        res.status = 500;
-       });
+   });

    // set timeouts and change hostname and port
    svr.set_read_timeout(sparams.read_timeout);

From 7a48ade7ef3da763f42afa438166da20697d5647 Mon Sep 17 00:00:00 2001
From: anon
Date: Tue, 13 Jun 2023 14:46:40 -0300
Subject: [PATCH 118/121] fix comment indentation

---
 examples/server/server.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index cb95291695aa1..3a87f5116bfed 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -385,10 +385,10 @@ struct llama_server_context {
            // 2-byte characters: 110xxxxx 10xxxxxx
            if ((c & 0xE0) == 0xC0) {
                multibyte_pending = 1;
-           // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+               // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
            } else if ((c & 0xF0) == 0xE0) {
                multibyte_pending = 2;
-           // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+               // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            } else if ((c & 0xF8) == 0xF0) {
                multibyte_pending = 3;
            } else {

From 546f850796423d0e14b3334f59575fdb442baebc Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Wed, 14 Jun 2023 17:41:58 +0300
Subject: [PATCH 119/121] Update examples/server/server.cpp

---
 examples/server/server.cpp | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3a87f5116bfed..f283082672468 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -693,9 +693,10 @@ static void parse_options_completion(const json & body, llama_server_context & l
        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
    }

-   if (body["logit_bias"].is_array()) {
+   const auto & logit_bias = body.find("logit_bias");
+   if (logit_bias != body.end() && logit_bias->is_array()) {
        const int n_vocab = llama_n_vocab(llama.ctx);
-       for (const auto & el : body["logit_bias"]) {
+       for (const auto & el : *logit_bias) {
            if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) {
                llama_token tok = el[0].get<llama_token>();
                if (tok >= 0 && tok < n_vocab) {
@@ -710,11 +711,13 @@ static void parse_options_completion(const json & body, llama_server_context & l
    }

    llama.params.antiprompt.clear();
-   if (!body["stop"].is_null()) {
-       const auto stop = body["stop"].get<std::vector<std::string>>();
-       std::copy_if(stop.begin(), stop.end(),
-                    std::back_inserter(llama.params.antiprompt),
-                    [](const std::string & str) { return !str.empty(); });
+   const auto & stop = body.find("stop");
+   if (stop != body.end() && stop->is_array()) {
+       for (const auto & word : *stop) {
+           if (!word.empty()) {
+               llama.params.antiprompt.push_back(word);
+           }
+       }
    }

    LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));

From bd81096927a07c0372ec3f92725894151a418edb Mon Sep 17 00:00:00 2001
From: anon
Date: Wed, 14 Jun 2023 13:29:05 -0300
Subject: [PATCH 120/121] fix typo in readme + don't ignore integers

---
 examples/server/README.md  | 2 +-
 examples/server/server.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 824fd5194273e..fbd2be8d4c053 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -154,7 +154,7 @@ node .

     `ignore_eos`: Ignore end of stream token and continue generating (default: false).

-    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `logit-bias: [[15043,1]]` to increase the likelihood of the token 'Hello', or `logit-bias: [[15043,-1]]` to decrease its likelihood. Setting the value to false, `logit-bias: [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
+    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).

 -   **POST** `/tokenize`: Tokenize a given text.

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f283082672468..8a1ad008664db 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -700,7 +700,7 @@ static void parse_options_completion(const json & body, llama_server_context & l
            if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) {
                llama_token tok = el[0].get<llama_token>();
                if (tok >= 0 && tok < n_vocab) {
-                   if (el[1].is_number_float()) {
+                   if (el[1].is_number()) {
                        llama.params.logit_bias[tok] = el[1].get<float>();
                    } else if (el[1].is_boolean() && !el[1].get<bool>()) {
                        llama.params.logit_bias[tok] = -INFINITY;

From aee859519ec4475b9572b0ca93868dbd75fd2b42 Mon Sep 17 00:00:00 2001
From: Randall Fitzgerald
Date: Thu, 15 Jun 2023 01:50:54 -0700
Subject: [PATCH 121/121] Update README.md

Fixed a typo.
---
 examples/server/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 93c38ce5deb27..474a28b20018f 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -119,7 +119,7 @@ node .

     `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).

-    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the the limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity).
+    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity).

     `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
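For quick reference, the README entries corrected above can be exercised together in one request. The sketch below is an illustration, not part of the patch series: the prompt text is a placeholder, the token id 15043 ('Hello') is taken from the README's own example, and the server is assumed to be running at the default address.

    # Non-streaming completion combining n_predict, n_keep, stop and logit_bias.
    curl --silent --request POST \
         --url "http://127.0.0.1:8080/completion" \
         --header "Content-Type: application/json" \
         --data '{
           "prompt": "### Human: Say hello.\n### Assistant:",
           "n_predict": 128,
           "n_keep": -1,
           "stop": ["\n### Human:"],
           "logit_bias": [[15043, 1.0]]
         }' | jq -r '.content'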
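Relatedly, the chat.sh patches earlier in this series ("piping the content out of the subshell") hinge on a bash detail worth spelling out: when the streaming curl output is piped into `while read`, the loop body runs in a subshell, so anything appended to ANSWER disappears once the pipeline ends, whereas reading from a process substitution keeps the loop in the current shell. A minimal standalone sketch, with a placeholder prompt and the default server address assumed:

    #!/bin/bash
    # Sketch only: mirrors the streaming pattern chat.sh converges on in this series.
    API_URL="${API_URL:-http://127.0.0.1:8080}"
    DATA='{"prompt": "### Human: Say hello.\n### Assistant:", "n_predict": 64, "stream": true}'

    ANSWER=''
    while IFS= read -r LINE; do
        if [[ $LINE = data:* ]]; then
            # each streamed line looks like: data: {"content":"...",...}
            CONTENT="$(echo "${LINE:5}" | jq -r '.content')"
            printf "%s" "${CONTENT}"
            ANSWER+="${CONTENT}"   # still set after the loop because no subshell is involved
        fi
    done < <(curl --silent --no-buffer --request POST \
                  --url "${API_URL}/completion" --data-raw "${DATA}")
    printf "\n"
    # ANSWER now holds the full reply and can be appended to a chat history array.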