From a0a08eedb6a23b31d8783bbb91ede583cbe7933a Mon Sep 17 00:00:00 2001 From: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com> Date: Wed, 22 Nov 2023 02:16:38 -0300 Subject: [PATCH 1/5] Add openai-compatible POST /v1/chat/completions API endpoint to server example --- examples/server/server.cpp | 347 ++++++++++++++++++++++++++++++++++++- 1 file changed, 346 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1f2c55f2dccdf..25c23d30bd65a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -29,6 +29,8 @@ #define SERVER_VERBOSE 1 #endif +#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" + using json = nlohmann::json; struct server_params @@ -63,6 +65,10 @@ static bool server_verbose = false; // base64 utils (TODO: move to common in the future) // +nlohmann::json oaicompat_completion_params_parse( + const nlohmann::json &body); +std::string format_chatml(std::vector messages); + static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" @@ -377,6 +383,9 @@ struct llama_client_slot bool stopped_eos = false; bool stopped_word = false; bool stopped_limit = false; + + bool oaicompat = false; + std::string oaicompat_model = ""; std::string stopping_word; @@ -676,7 +685,16 @@ struct llama_server_context bool launch_slot_with_data(llama_client_slot* &slot, json data) { slot_params default_params; llama_sampling_params default_sparams; - + + if (data.count("__oaicompat") != 0) { + slot->oaicompat = true; + slot->oaicompat_model = + json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + } else { + slot->oaicompat = false; + slot->oaicompat_model = ""; + } + slot->params.stream = json_value(data, "stream", false); slot->params.cache_prompt = json_value(data, "cache_prompt", false); slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); @@ -1169,6 +1187,12 @@ struct llama_server_context res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); } + if (slot.oaicompat) + { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + queue_results.push_back(res); } @@ -1216,6 +1240,12 @@ struct llama_server_context res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs); } + if (slot.oaicompat) + { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + queue_results.push_back(res); } @@ -2178,6 +2208,249 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } + +static std::string random_string() { + std::string str( + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + std::random_device rd; + std::mt19937 generator(rd()); + + std::shuffle(str.begin(), str.end(), generator); + + return str.substr(0, 32); // assumes 32 < number of characters in str +} + +static std::string gen_chatcmplid() { + std::stringstream chatcmplid; + chatcmplid << "chatcmpl-" << random_string(); + return chatcmplid.str(); +} + +std::string format_chatml(std::vector messages) { + + std::ostringstream chatml_msgs; + + // iterate the array + for (auto it = messages.begin(); it != messages.end(); ++it) { + chatml_msgs << "<|im_start|>" + << json_value(*it, "role", std::string("user")) << '\n'; + chatml_msgs << json_value(*it, "content", std::string("")) + << "<|im_end|>\n"; + } + + chatml_msgs << "<|im_start|>assistant" << '\n'; + + return 
chatml_msgs.str(); +} + +/* llama.cpp completion api semantics */ +nlohmann::json oaicompat_completion_params_parse( + const nlohmann::json &body /* openai api json semantics */) { + nlohmann::json llama_params; + + llama_params["__oaicompat"] = true; + + // Map OpenAI parameters to llama.cpp parameters + llama_params["prompt"] = format_chatml( + body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' + llama_params["temperature"] = + json_value(body, "temperature", 0.8); // Default to 0.8 if not provided + llama_params["top_k"] = + json_value(body, "max_tokens", 40); // Default to 40 if not provided + llama_params["top_p"] = + json_value(body, "top_p", 0.95); // Default to 0.95 if not provided + llama_params["n_predict"] = + json_value(body, "max_tokens", -1); // Default to -1 if not provided + llama_params["logit_bias"] = json_value( + body, "logit_bias", + nlohmann::json::object()); // Default to empty object if not provided + llama_params["frequency_penalty"] = json_value( + body, "frequency_penalty", 0.0); // Default to 0.0 if not provided + llama_params["presence_penalty"] = json_value( + body, "presence_penalty", 0.0); // Default to 0.0 if not provided + llama_params["seed"] = json_value(body, "seed", 0); + llama_params["stream"] = + json_value(body, "stream", false); // Default to 0 if not provided + llama_params["mirostat"] = + json_value(body, "mirostat", false); // Default to false if not provided + llama_params["mirostat_tau"] = + json_value(body, "mirostat_tau", 0.0); // Default to 0.0 if not provided + llama_params["mirostat_eta"] = + json_value(body, "mirostat_eta", 0.0); // Default to 0.0 if not provided + llama_params["penalize_nl"] = json_value( + body, "penalize_nl", false); // Default to false if not provided + llama_params["typical_p"] = + json_value(body, "typical_p", 0.0); // Default to 0.0 if not provided + llama_params["repeat_last_n"] = + json_value(body, "repeat_last_n", 0); // Default to 0 if not provided + llama_params["ignore_eos"] = + json_value(body, "ignore_eos", false); // Default to false if not provided + llama_params["tfs_z"] = + json_value(body, "tfs_z", 0.0); // Default to 0.0 if not provided + if (llama_params.count("grammar") != 0) { + llama_params["grammar"] = json_value( + body, "grammar", + nlohmann::json::object()); // Default to empty object if not provided + } + + // Handle 'stop' field + if (body["stop"].is_null()) { + llama_params["stop"] = json::array({}); + } else if (body["stop"].is_string()) { + llama_params["stop"] = json::array({body["stop"].get()}); + } else { + llama_params["stop"] = json_value( + body, "stop", + json::array()); // Default to empty array if not provided + } + + llama_params["stop"].push_back("<|im_end|>"); + + return llama_params; +} + +static json format_final_response_oaicompat(json request, task_result response, + bool streaming = false) { + + json result = response.result_json; + + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason = "length"; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + + json choices = + streaming ? 
json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}) + : json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{{"content", content}, + {"role", "assistant"}}}}}); + + std::time_t t = std::time(0); + + json res = + json{{"choices", choices}, + {"created", t}, + {"model", + json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, + {"usage", + json{{"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, + {"id", gen_chatcmplid()}}; + + if (server_verbose) { + res["__verbose"] = result; + } + + if (result.contains("completion_probabilities")) { + res["completion_probabilities"] = + json_value(result, "completion_probabilities", json::array()); + } + + return res; +} + +static std::vector format_partial_response_oaicompat(task_result response) { + json result = response.result_json; + + if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { + return std::vector({response.result_json}); + } + + bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; + std::string modelname = + json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason = ""; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + if (stopped_limit) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", + json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{{"choices", + json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. 
+ if (content.empty()) { + return std::vector({json::object()}); + } + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json{{"choices", choices}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector({ret}); +} + static json format_partial_response( llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector &probs ) { @@ -2396,6 +2669,78 @@ int main(int argc, char **argv) } }); + + svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, + httplib::Response &res) { + json data = oaicompat_completion_params_parse(json::parse(req.body)); + + const int task_id = llama.request_completion(data, false, false); + if (!json_value(data, "stream", false)) { + std::string completion_text; + task_result result = llama.next_result(task_id); + + if (!result.error && result.stop) { + json oaicompat_result = format_final_response_oaicompat(data, result); + + res.set_content(oaicompat_result.dump(-1, ' ', false, + json::error_handler_t::replace), + "application/json"); + } else { + res.status = 500; + res.set_content(result.result_json["content"], "text/plain"); + return; + } + } else { + const auto chunked_content_provider = [task_id, &llama](size_t, + httplib::DataSink &sink) { + while (true) { + task_result llama_result = llama.next_result(task_id); + if (!llama_result.error) { + std::vector result_array = format_partial_response_oaicompat( llama_result); + + for (auto it = result_array.begin(); it != result_array.end(); ++it) + { + if (!it->empty()) { + const std::string str = + "data: " + + it->dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } + } + } + if (llama_result.stop) { + break; + } + } else { + const std::string str = + "error: " + + llama_result.result_json.dump(-1, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } + break; + } + } + sink.done(); + return true; + }; + + auto on_complete = [task_id, &llama](bool) { + // cancel + llama.request_cancel(task_id); + }; + + res.set_chunked_content_provider("text/event-stream", + chunked_content_provider, on_complete); + } + }); + svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) { json data = json::parse(req.body); From 2f84f5dc84ba7e190309f87193592468eeb4cd4d Mon Sep 17 00:00:00 2001 From: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com> Date: Wed, 22 Nov 2023 02:40:47 -0300 Subject: [PATCH 2/5] fix code style --- examples/server/server.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 25c23d30bd65a..98552a83139a4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -65,8 +65,8 @@ static bool server_verbose = false; // base64 utils (TODO: move to common in the future) // -nlohmann::json oaicompat_completion_params_parse( - const nlohmann::json &body); +json oaicompat_completion_params_parse( + const json &body); std::string format_chatml(std::vector messages); static const std::string base64_chars = @@ -2245,9 +2245,9 @@ std::string format_chatml(std::vector messages) { } /* llama.cpp completion 
api semantics */ -nlohmann::json oaicompat_completion_params_parse( - const nlohmann::json &body /* openai api json semantics */) { - nlohmann::json llama_params; +json oaicompat_completion_params_parse( + const json &body /* openai api json semantics */) { + json llama_params; llama_params["__oaicompat"] = true; @@ -2264,7 +2264,7 @@ nlohmann::json oaicompat_completion_params_parse( json_value(body, "max_tokens", -1); // Default to -1 if not provided llama_params["logit_bias"] = json_value( body, "logit_bias", - nlohmann::json::object()); // Default to empty object if not provided + json::object()); // Default to empty object if not provided llama_params["frequency_penalty"] = json_value( body, "frequency_penalty", 0.0); // Default to 0.0 if not provided llama_params["presence_penalty"] = json_value( @@ -2291,7 +2291,7 @@ nlohmann::json oaicompat_completion_params_parse( if (llama_params.count("grammar") != 0) { llama_params["grammar"] = json_value( body, "grammar", - nlohmann::json::object()); // Default to empty object if not provided + json::object()); // Default to empty object if not provided } // Handle 'stop' field From af4d68b22d28e9c3bb6fc8467a8872649840adac Mon Sep 17 00:00:00 2001 From: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com> Date: Wed, 22 Nov 2023 03:55:23 -0300 Subject: [PATCH 3/5] Update server README.md --- examples/server/README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/examples/server/README.md b/examples/server/README.md index a6eda3b32d576..be13529fc03bb 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -234,6 +234,39 @@ node index.js - **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots. +- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. + + *Options:* + + See (OpenAI Chat Completions API documentation)[https://platform.openai.com/docs/api-reference/chat]. While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported. + + *Examples:* + + You can use either Python `openai` library with appropriate checkpoints, or raw HTTP requests: + + ```python + openai.api_base = "http://:port" + ``` + + ```shell + curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ + -d '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "system", + "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests." 
+ }, + { + "role": "user", + "content": "Write a limerick about python exceptions" + } + ] + }' + ``` + ## More examples ### Change system prompt on runtime From 9ad4d273e11deb2ebed201d846bb97ae0129320e Mon Sep 17 00:00:00 2001 From: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com> Date: Wed, 22 Nov 2023 04:17:12 -0300 Subject: [PATCH 4/5] Improve server README.md --- examples/server/README.md | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index be13529fc03bb..cfc220f5810b3 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -234,19 +234,35 @@ node index.js - **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots. -- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. +- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. *Options:* - See (OpenAI Chat Completions API documentation)[https://platform.openai.com/docs/api-reference/chat]. While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported. + See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported. *Examples:* - You can use either Python `openai` library with appropriate checkpoints, or raw HTTP requests: + You can use either Python `openai` library with appropriate checkpoints: ```python - openai.api_base = "http://:port" + import openai + + client = openai.OpenAI( + base_url="http://localhost:8080/v1", # "http://:port" + api_key = "sk-no-key-required" + ) + + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."}, + {"role": "user", "content": "Write a limerick about python exceptions"} + ] + ) + + print(completion.choices[0].message) ``` + ... 
or raw HTTP requests: ```shell curl http://localhost:8080/v1/chat/completions \ From e1516709f217ac8d342b28e9b9a2c0e74b57310b Mon Sep 17 00:00:00 2001 From: kir-gadjello <111190790+kir-gadjello@users.noreply.github.com> Date: Wed, 22 Nov 2023 22:35:57 -0300 Subject: [PATCH 5/5] Fix server.cpp code style according to review --- examples/server/server.cpp | 549 ++++++++++++++++++------------------- 1 file changed, 267 insertions(+), 282 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 98552a83139a4..54455ad9a7367 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -61,14 +61,14 @@ static bool server_verbose = false; #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) +json oaicompat_completion_params_parse(const json &body); +std::string format_chatml(std::vector messages); + + // // base64 utils (TODO: move to common in the future) // -json oaicompat_completion_params_parse( - const json &body); -std::string format_chatml(std::vector messages); - static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" @@ -688,8 +688,7 @@ struct llama_server_context if (data.count("__oaicompat") != 0) { slot->oaicompat = true; - slot->oaicompat_model = - json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); } else { slot->oaicompat = false; slot->oaicompat_model = ""; @@ -2209,246 +2208,232 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } -static std::string random_string() { - std::string str( - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); +static std::string random_string() +{ + std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - std::random_device rd; - std::mt19937 generator(rd()); + std::random_device rd; + std::mt19937 generator(rd()); - std::shuffle(str.begin(), str.end(), generator); + std::shuffle(str.begin(), str.end(), generator); - return str.substr(0, 32); // assumes 32 < number of characters in str + return str.substr(0, 32); // assumes 32 < number of characters in str } -static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); +static std::string gen_chatcmplid() +{ + std::stringstream chatcmplid; + chatcmplid << "chatcmpl-" << random_string(); + return chatcmplid.str(); } -std::string format_chatml(std::vector messages) { - - std::ostringstream chatml_msgs; +std::string format_chatml(std::vector messages) +{ + std::ostringstream chatml_msgs; - // iterate the array - for (auto it = messages.begin(); it != messages.end(); ++it) { - chatml_msgs << "<|im_start|>" - << json_value(*it, "role", std::string("user")) << '\n'; - chatml_msgs << json_value(*it, "content", std::string("")) - << "<|im_end|>\n"; - } + for (auto it = messages.begin(); it != messages.end(); ++it) { + chatml_msgs << "<|im_start|>" + << json_value(*it, "role", std::string("user")) << '\n'; + chatml_msgs << json_value(*it, "content", std::string("")) + << "<|im_end|>\n"; + } - chatml_msgs << "<|im_start|>assistant" << '\n'; + chatml_msgs << "<|im_start|>assistant" << '\n'; - return chatml_msgs.str(); + return chatml_msgs.str(); } /* llama.cpp completion api semantics */ json oaicompat_completion_params_parse( - const json &body /* 
openai api json semantics */) { - json llama_params; - - llama_params["__oaicompat"] = true; - - // Map OpenAI parameters to llama.cpp parameters - llama_params["prompt"] = format_chatml( - body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' - llama_params["temperature"] = - json_value(body, "temperature", 0.8); // Default to 0.8 if not provided - llama_params["top_k"] = - json_value(body, "max_tokens", 40); // Default to 40 if not provided - llama_params["top_p"] = - json_value(body, "top_p", 0.95); // Default to 0.95 if not provided - llama_params["n_predict"] = - json_value(body, "max_tokens", -1); // Default to -1 if not provided - llama_params["logit_bias"] = json_value( - body, "logit_bias", - json::object()); // Default to empty object if not provided - llama_params["frequency_penalty"] = json_value( - body, "frequency_penalty", 0.0); // Default to 0.0 if not provided - llama_params["presence_penalty"] = json_value( - body, "presence_penalty", 0.0); // Default to 0.0 if not provided - llama_params["seed"] = json_value(body, "seed", 0); - llama_params["stream"] = - json_value(body, "stream", false); // Default to 0 if not provided - llama_params["mirostat"] = - json_value(body, "mirostat", false); // Default to false if not provided - llama_params["mirostat_tau"] = - json_value(body, "mirostat_tau", 0.0); // Default to 0.0 if not provided - llama_params["mirostat_eta"] = - json_value(body, "mirostat_eta", 0.0); // Default to 0.0 if not provided - llama_params["penalize_nl"] = json_value( - body, "penalize_nl", false); // Default to false if not provided - llama_params["typical_p"] = - json_value(body, "typical_p", 0.0); // Default to 0.0 if not provided - llama_params["repeat_last_n"] = - json_value(body, "repeat_last_n", 0); // Default to 0 if not provided - llama_params["ignore_eos"] = - json_value(body, "ignore_eos", false); // Default to false if not provided - llama_params["tfs_z"] = - json_value(body, "tfs_z", 0.0); // Default to 0.0 if not provided - if (llama_params.count("grammar") != 0) { - llama_params["grammar"] = json_value( - body, "grammar", - json::object()); // Default to empty object if not provided - } - - // Handle 'stop' field - if (body["stop"].is_null()) { - llama_params["stop"] = json::array({}); - } else if (body["stop"].is_string()) { - llama_params["stop"] = json::array({body["stop"].get()}); - } else { - llama_params["stop"] = json_value( - body, "stop", - json::array()); // Default to empty array if not provided - } - - llama_params["stop"].push_back("<|im_end|>"); - - return llama_params; + const json &body /* openai api json semantics */) +{ + json llama_params; + + llama_params["__oaicompat"] = true; + + // Map OpenAI parameters to llama.cpp parameters + llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' + llama_params["temperature"] = json_value(body, "temperature", 0.8); + llama_params["top_k"] = json_value(body, "max_tokens", 40); + llama_params["top_p"] = json_value(body, "top_p", 0.95); + llama_params["n_predict"] = json_value(body, "max_tokens", -1); + llama_params["logit_bias"] = json_value(body, "logit_bias",json::object()); + llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); + llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); + llama_params["seed"] = json_value(body, "seed", 0); + llama_params["stream"] =json_value(body, "stream", false); + llama_params["mirostat"] = json_value(body, "mirostat", false); + 
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0); + llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0); + llama_params["penalize_nl"] = json_value(body, "penalize_nl", false); + llama_params["typical_p"] = json_value(body, "typical_p", 0.0); + llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0); + llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); + llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); + + if (llama_params.count("grammar") != 0) { + llama_params["grammar"] = json_value( + body, "grammar", + json::object()); + } + + // Handle 'stop' field + if (body["stop"].is_null()) { + llama_params["stop"] = json::array({}); + } else if (body["stop"].is_string()) { + llama_params["stop"] = json::array({body["stop"].get()}); + } else { + llama_params["stop"] = json_value( + body, "stop", + json::array()); + } + + // Ensure there is ChatML-specific end sequence among stop words + llama_params["stop"].push_back("<|im_end|>"); + + return llama_params; } static json format_final_response_oaicompat(json request, task_result response, - bool streaming = false) { + bool streaming = false) +{ + json result = response.result_json; + + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); - json result = response.result_json; + std::string finish_reason = "length"; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); + json choices = + streaming ? json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}) + : json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{{"content", content}, + {"role", "assistant"}}}}}); + + std::time_t t = std::time(0); + + json res = + json{{"choices", choices}, + {"created", t}, + {"model", + json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, + {"usage", + json{{"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, + {"id", gen_chatcmplid()}}; + + if (server_verbose) { + res["__verbose"] = result; + } - std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } + if (result.contains("completion_probabilities")) { + res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); + } - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = - json{{"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, - {"usage", - json{{"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, - {"id", gen_chatcmplid()}}; - - if (server_verbose) { - res["__verbose"] = result; - } - - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = - json_value(result, "completion_probabilities", json::array()); - } - - return res; + return res; } +// return value is vector as there is one case where we might need to generate two responses static std::vector format_partial_response_oaicompat(task_result response) { - json result = response.result_json; - - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({response.result_json}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = - json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason = ""; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - if (stopped_limit) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, + json result = response.result_json; + + if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { + return std::vector({response.result_json}); + } + + bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; + std::string modelname = + json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason = ""; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + if (stopped_limit) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", - json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{{"choices", - json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - return std::vector({initial_ret, second_ret}); - } + {"delta", json::object()}}}); } else { - // Some idosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. 
- if (content.empty()) { - return std::vector({json::object()}); - } - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. + if (content.empty()) { + return std::vector({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } } - } - json ret = json{{"choices", choices}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({ret}); + json ret = json{{"choices", choices}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector({ret}); } static json format_partial_response( @@ -2670,76 +2655,76 @@ int main(int argc, char **argv) }); - svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, - httplib::Response &res) { - json data = oaicompat_completion_params_parse(json::parse(req.body)); + svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, + httplib::Response &res) + { + json data = oaicompat_completion_params_parse(json::parse(req.body)); - const int task_id = llama.request_completion(data, false, false); - if (!json_value(data, "stream", false)) { - std::string completion_text; - task_result result = llama.next_result(task_id); + const int task_id = llama.request_completion(data, false, false); + + if (!json_value(data, "stream", false)) { + std::string completion_text; + task_result result = llama.next_result(task_id); - if (!result.error && result.stop) { - json oaicompat_result = format_final_response_oaicompat(data, result); + if (!result.error && result.stop) { + json oaicompat_result = format_final_response_oaicompat(data, result); - res.set_content(oaicompat_result.dump(-1, ' ', false, - json::error_handler_t::replace), - "application/json"); - } else { - res.status = 500; - res.set_content(result.result_json["content"], "text/plain"); - return; - } - } else { - const auto chunked_content_provider = [task_id, &llama](size_t, - httplib::DataSink &sink) { - while (true) { - task_result llama_result = llama.next_result(task_id); - if (!llama_result.error) { - std::vector result_array = format_partial_response_oaicompat( llama_result); - - for (auto it = result_array.begin(); it != result_array.end(); ++it) - { - if (!it->empty()) { - const std::string str = - "data: " + - it->dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - LOG_VERBOSE("data 
stream", {{"to_send", str}}); - if (!sink.write(str.c_str(), str.size())) { - return false; + res.set_content(oaicompat_result.dump(-1, ' ', false, + json::error_handler_t::replace), + "application/json"); + } else { + res.status = 500; + res.set_content(result.result_json["content"], "text/plain"); + return; } - } - } - if (llama_result.stop) { - break; - } - } else { - const std::string str = - "error: " + - llama_result.result_json.dump(-1, ' ', false, - json::error_handler_t::replace) + - "\n\n"; - LOG_VERBOSE("data stream", {{"to_send", str}}); - if (!sink.write(str.c_str(), str.size())) { - return false; - } - break; - } - } - sink.done(); - return true; - }; + } else { + const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) { + while (true) { + task_result llama_result = llama.next_result(task_id); + if (!llama_result.error) { + std::vector result_array = format_partial_response_oaicompat( llama_result); + + for (auto it = result_array.begin(); it != result_array.end(); ++it) + { + if (!it->empty()) { + const std::string str = + "data: " + + it->dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } + } + } + if (llama_result.stop) { + break; + } + } else { + const std::string str = + "error: " + + llama_result.result_json.dump(-1, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } + break; + } + } + sink.done(); + return true; + }; - auto on_complete = [task_id, &llama](bool) { - // cancel - llama.request_cancel(task_id); - }; + auto on_complete = [task_id, &llama](bool) { + // cancel request + llama.request_cancel(task_id); + }; - res.set_chunked_content_provider("text/event-stream", - chunked_content_provider, on_complete); - } - }); + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); + } + }); svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) {