Skip to content

Commit b63bd01

Browse files
authored
server : add fields to verbose_json response (ggml-org#1802)
* server: include additional fields in the verbose_json response as OpenAI does
* server: show request examples on home page
* server: todo note for compression_ratio and no_speech_prob
* server: add simple demo form to the homepage
1 parent 2096ceb commit b63bd01

File tree

1 file changed

+87
-2
lines changed

1 file changed

+87
-2
lines changed

examples/server/server.cpp

Lines changed: 87 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -543,7 +543,76 @@ int main(int argc, char ** argv) {
543543
{"Access-Control-Allow-Origin", "*"},
544544
{"Access-Control-Allow-Headers", "content-type"}});
545545

546-
std::string const default_content = "<html>hello</html>";
546+
std::string const default_content = R"(
547+
<html>
548+
<head>
549+
<title>Whisper.cpp Server</title>
550+
<meta charset="utf-8">
551+
<meta name="viewport" content="width=device-width">
552+
<style>
553+
body {
554+
font-family: sans-serif;
555+
}
556+
form {
557+
display: flex;
558+
flex-direction: column;
559+
align-items: flex-start;
560+
}
561+
label {
562+
margin-bottom: 0.5rem;
563+
}
564+
input, select {
565+
margin-bottom: 1rem;
566+
}
567+
button {
568+
margin-top: 1rem;
569+
}
570+
</style>
571+
</head>
572+
<body>
573+
<h1>Whisper.cpp Server</h1>
574+
575+
<h2>/inference</h2>
576+
<pre>
577+
curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
578+
-H "Content-Type: multipart/form-data" \
579+
-F file="@&lt;file-path&gt;" \
580+
-F temperature="0.0" \
581+
-F temperature_inc="0.2" \
582+
-F response_format="json"
583+
</pre>
584+
585+
<h2>/load</h2>
586+
<pre>
587+
curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/load \
588+
-H "Content-Type: multipart/form-data" \
589+
-F model="&lt;path-to-model-file&gt;"
590+
</pre>
591+
592+
<div>
593+
<h2>Try it out</h2>
594+
<form action="/inference" method="POST" enctype="multipart/form-data">
595+
<label for="file">Choose an audio file:</label>
596+
<input type="file" id="file" name="file" accept="audio/*" required><br>
597+
598+
<label for="temperature">Temperature:</label>
599+
<input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
600+
601+
<label for="response_format">Response Format:</label>
602+
<select id="response_format" name="response_format">
603+
<option value="verbose_json">Verbose JSON</option>
604+
<option value="json">JSON</option>
605+
<option value="text">Text</option>
606+
<option value="srt">SRT</option>
607+
<option value="vtt">VTT</option>
608+
</select><br>
609+
610+
<button type="submit">Submit</button>
611+
</form>
612+
</div>
613+
</body>
614+
</html>
615+
)";
547616

548617
// store default params so we can reset after each inference request
549618
whisper_params default_params = params;
@@ -787,7 +856,13 @@ int main(int argc, char ** argv) {
787856
} else if (params.response_format == vjson_format) {
788857
/* try to match openai/whisper's Python format */
789858
std::string results = output_str(ctx, params, pcmf32s);
790-
json jres = json{{"text", results}};
859+
json jres = json{
860+
{"task", params.translate ? "translate" : "transcribe"},
861+
{"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
862+
{"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
863+
{"text", results},
864+
{"segments", json::array()}
865+
};
791866
const int n_segments = whisper_full_n_segments(ctx);
792867
for (int i = 0; i < n_segments; ++i)
793868
{
@@ -801,6 +876,7 @@ int main(int argc, char ** argv) {
801876
segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
802877
}
803878

879+
float total_logprob = 0;
804880
const int n_tokens = whisper_full_n_tokens(ctx, i);
805881
for (int j = 0; j < n_tokens; ++j) {
806882
whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
@@ -815,8 +891,17 @@ int main(int argc, char ** argv) {
815891
word["end"] = token.t1 * 0.01;
816892
}
817893
word["probability"] = token.p;
894+
total_logprob += token.plog;
818895
segment["words"].push_back(word);
819896
}
897+
898+
segment["temperature"] = params.temperature;
899+
segment["avg_logprob"] = total_logprob / n_tokens;
900+
901+
// TODO compression_ratio and no_speech_prob are not implemented yet
902+
// segment["compression_ratio"] = 0;
903+
// segment["no_speech_prob"] = 0;
904+
820905
jres["segments"].push_back(segment);
821906
}
822907
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),

0 commit comments

Comments (0)