18
18
#endif
19
19
20
20
using namespace httplib ;
21
- using json = nlohmann::json ;
21
+ using json = nlohmann::ordered_json ;
22
22
23
23
namespace {
24
24
@@ -543,7 +543,76 @@ int main(int argc, char ** argv) {
543
543
{" Access-Control-Allow-Origin" , " *" },
544
544
{" Access-Control-Allow-Headers" , " content-type" }});
545
545
546
- std::string const default_content = " <html>hello</html>" ;
546
+ std::string const default_content = R"(
547
+ <html>
548
+ <head>
549
+ <title>Whisper.cpp Server</title>
550
+ <meta charset="utf-8">
551
+ <meta name="viewport" content="width=device-width">
552
+ <style>
553
+ body {
554
+ font-family: sans-serif;
555
+ }
556
+ form {
557
+ display: flex;
558
+ flex-direction: column;
559
+ align-items: flex-start;
560
+ }
561
+ label {
562
+ margin-bottom: 0.5rem;
563
+ }
564
+ input, select {
565
+ margin-bottom: 1rem;
566
+ }
567
+ button {
568
+ margin-top: 1rem;
569
+ }
570
+ </style>
571
+ </head>
572
+ <body>
573
+ <h1>Whisper.cpp Server</h1>
574
+
575
+ <h2>/inference</h2>
576
+ <pre>
577
+ curl 127.0.0.1:)" + std::to_string (sparams.port ) + R"( /inference \
578
+ -H "Content-Type: multipart/form-data" \
579
+ -F file="@<file-path>" \
580
+ -F temperature="0.0" \
581
+ -F temperature_inc="0.2" \
582
+ -F response_format="json"
583
+ </pre>
584
+
585
+ <h2>/load</h2>
586
+ <pre>
587
+ curl 127.0.0.1:)" + std::to_string (sparams.port ) + R"( /load \
588
+ -H "Content-Type: multipart/form-data" \
589
+ -F model="<path-to-model-file>"
590
+ </pre>
591
+
592
+ <div>
593
+ <h2>Try it out</h2>
594
+ <form action="/inference" method="POST" enctype="multipart/form-data">
595
+ <label for="file">Choose an audio file:</label>
596
+ <input type="file" id="file" name="file" accept="audio/*" required><br>
597
+
598
+ <label for="temperature">Temperature:</label>
599
+ <input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
600
+
601
+ <label for="response_format">Response Format:</label>
602
+ <select id="response_format" name="response_format">
603
+ <option value="verbose_json">Verbose JSON</option>
604
+ <option value="json">JSON</option>
605
+ <option value="text">Text</option>
606
+ <option value="srt">SRT</option>
607
+ <option value="vtt">VTT</option>
608
+ </select><br>
609
+
610
+ <button type="submit">Submit</button>
611
+ </form>
612
+ </div>
613
+ </body>
614
+ </html>
615
+ )" ;
547
616
548
617
// store default params so we can reset after each inference request
549
618
whisper_params default_params = params;
@@ -556,15 +625,14 @@ int main(int argc, char ** argv) {
556
625
557
626
svr.Post (sparams.request_path + " /inference" , [&](const Request &req, Response &res){
558
627
// acquire whisper model mutex lock
559
- whisper_mutex. lock ();
628
+ std::lock_guard<std::mutex> lock (whisper_mutex );
560
629
561
630
// first check user requested fields of the request
562
631
if (!req.has_file (" file" ))
563
632
{
564
633
fprintf (stderr, " error: no 'file' field in the request\n " );
565
634
const std::string error_resp = " {\" error\" :\" no 'file' field in the request\" }" ;
566
635
res.set_content (error_resp, " application/json" );
567
- whisper_mutex.unlock ();
568
636
return ;
569
637
}
570
638
auto audio_file = req.get_file_value (" file" );
@@ -579,35 +647,42 @@ int main(int argc, char ** argv) {
579
647
std::vector<float > pcmf32; // mono-channel F32 PCM
580
648
std::vector<std::vector<float >> pcmf32s; // stereo-channel F32 PCM
581
649
582
- // write to temporary file
583
- const std::string temp_filename = " whisper_server_temp_file.wav" ;
584
- std::ofstream temp_file{temp_filename, std::ios::binary};
585
- temp_file << audio_file.content ;
586
- temp_file.close ();
587
-
588
- // if file is not wav, convert to wav
589
-
590
650
if (sparams.ffmpeg_converter ) {
651
+ // if file is not wav, convert to wav
652
+ // write to temporary file
653
+ const std::string temp_filename = " whisper_server_temp_file.wav" ;
654
+ std::ofstream temp_file{temp_filename, std::ios::binary};
655
+ temp_file << audio_file.content ;
656
+ temp_file.close ();
657
+
591
658
std::string error_resp = " {\" error\" :\" Failed to execute ffmpeg command.\" }" ;
592
659
const bool is_converted = convert_to_wav (temp_filename, error_resp);
593
660
if (!is_converted) {
594
661
res.set_content (error_resp, " application/json" );
595
- whisper_mutex.unlock ();
596
662
return ;
597
663
}
598
- }
599
664
600
- // read wav content into pcmf32
601
- if (!::read_wav (temp_filename, pcmf32, pcmf32s, params.diarize )) {
602
- fprintf (stderr, " error: failed to read WAV file '%s'\n " , temp_filename.c_str ());
603
- const std::string error_resp = " {\" error\" :\" failed to read WAV file\" }" ;
604
- res.set_content (error_resp, " application/json" );
665
+ // read wav content into pcmf32
666
+ if (!::read_wav (temp_filename, pcmf32, pcmf32s, params.diarize ))
667
+ {
668
+ fprintf (stderr, " error: failed to read WAV file '%s'\n " , temp_filename.c_str ());
669
+ const std::string error_resp = " {\" error\" :\" failed to read WAV file\" }" ;
670
+ res.set_content (error_resp, " application/json" );
671
+ std::remove (temp_filename.c_str ());
672
+ return ;
673
+ }
674
+ // remove temp file
605
675
std::remove (temp_filename.c_str ());
606
- whisper_mutex.unlock ();
607
- return ;
676
+ } else {
677
+ if (!::read_wav (audio_file.content , pcmf32, pcmf32s, params.diarize ))
678
+ {
679
+ fprintf (stderr, " error: failed to read WAV file\n " );
680
+ const std::string error_resp = " {\" error\" :\" failed to read WAV file\" }" ;
681
+ res.set_content (error_resp, " application/json" );
682
+ return ;
683
+ }
608
684
}
609
- // remove temp file
610
- std::remove (temp_filename.c_str ());
685
+
611
686
612
687
printf (" Successfully loaded %s\n " , filename.c_str ());
613
688
@@ -681,6 +756,7 @@ int main(int argc, char ** argv) {
681
756
wparams.logprob_thold = params.logprob_thold ;
682
757
683
758
wparams.no_timestamps = params.no_timestamps ;
759
+ wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
684
760
685
761
whisper_print_user_data user_data = { ¶ms, &pcmf32s, 0 };
686
762
@@ -724,7 +800,6 @@ int main(int argc, char ** argv) {
724
800
fprintf (stderr, " %s: failed to process audio\n " , argv[0 ]);
725
801
const std::string error_resp = " {\" error\" :\" failed to process audio\" }" ;
726
802
res.set_content (error_resp, " application/json" );
727
- whisper_mutex.unlock ();
728
803
return ;
729
804
}
730
805
}
@@ -778,6 +853,59 @@ int main(int argc, char ** argv) {
778
853
ss << speaker << text << " \n\n " ;
779
854
}
780
855
res.set_content (ss.str (), " text/vtt" );
856
+ } else if (params.response_format == vjson_format) {
857
+ /* try to match openai/whisper's Python format */
858
+ std::string results = output_str (ctx, params, pcmf32s);
859
+ json jres = json{
860
+ {" task" , params.translate ? " translate" : " transcribe" },
861
+ {" language" , whisper_lang_str_full (whisper_full_lang_id (ctx))},
862
+ {" duration" , float (pcmf32.size ())/WHISPER_SAMPLE_RATE},
863
+ {" text" , results},
864
+ {" segments" , json::array ()}
865
+ };
866
+ const int n_segments = whisper_full_n_segments (ctx);
867
+ for (int i = 0 ; i < n_segments; ++i)
868
+ {
869
+ json segment = json{
870
+ {" id" , i},
871
+ {" text" , whisper_full_get_segment_text (ctx, i)},
872
+ };
873
+
874
+ if (!params.no_timestamps ) {
875
+ segment[" start" ] = whisper_full_get_segment_t0 (ctx, i) * 0.01 ;
876
+ segment[" end" ] = whisper_full_get_segment_t1 (ctx, i) * 0.01 ;
877
+ }
878
+
879
+ float total_logprob = 0 ;
880
+ const int n_tokens = whisper_full_n_tokens (ctx, i);
881
+ for (int j = 0 ; j < n_tokens; ++j) {
882
+ whisper_token_data token = whisper_full_get_token_data (ctx, i, j);
883
+ if (token.id >= whisper_token_eot (ctx)) {
884
+ continue ;
885
+ }
886
+
887
+ segment[" tokens" ].push_back (token.id );
888
+ json word = json{{" word" , whisper_full_get_token_text (ctx, i, j)}};
889
+ if (!params.no_timestamps ) {
890
+ word[" start" ] = token.t0 * 0.01 ;
891
+ word[" end" ] = token.t1 * 0.01 ;
892
+ }
893
+ word[" probability" ] = token.p ;
894
+ total_logprob += token.plog ;
895
+ segment[" words" ].push_back (word);
896
+ }
897
+
898
+ segment[" temperature" ] = params.temperature ;
899
+ segment[" avg_logprob" ] = total_logprob / n_tokens;
900
+
901
+ // TODO compression_ratio and no_speech_prob are not implemented yet
902
+ // segment["compression_ratio"] = 0;
903
+ // segment["no_speech_prob"] = 0;
904
+
905
+ jres[" segments" ].push_back (segment);
906
+ }
907
+ res.set_content (jres.dump (-1 , ' ' , false , json::error_handler_t ::replace),
908
+ " application/json" );
781
909
}
782
910
// TODO add more output formats
783
911
else
@@ -792,18 +920,14 @@ int main(int argc, char ** argv) {
792
920
793
921
// reset params to their defaults
794
922
params = default_params;
795
-
796
- // return whisper model mutex lock
797
- whisper_mutex.unlock ();
798
923
});
799
924
svr.Post (sparams.request_path + " /load" , [&](const Request &req, Response &res){
800
- whisper_mutex. lock ();
925
+ std::lock_guard<std::mutex> lock (whisper_mutex );
801
926
if (!req.has_file (" model" ))
802
927
{
803
928
fprintf (stderr, " error: no 'model' field in the request\n " );
804
929
const std::string error_resp = " {\" error\" :\" no 'model' field in the request\" }" ;
805
930
res.set_content (error_resp, " application/json" );
806
- whisper_mutex.unlock ();
807
931
return ;
808
932
}
809
933
std::string model = req.get_file_value (" model" ).content ;
@@ -812,7 +936,6 @@ int main(int argc, char ** argv) {
812
936
fprintf (stderr, " error: 'model': %s not found!\n " , model.c_str ());
813
937
const std::string error_resp = " {\" error\" :\" model not found!\" }" ;
814
938
res.set_content (error_resp, " application/json" );
815
- whisper_mutex.unlock ();
816
939
return ;
817
940
}
818
941
@@ -835,7 +958,6 @@ int main(int argc, char ** argv) {
835
958
res.set_content (success, " application/text" );
836
959
837
960
// check if the model is in the file system
838
- whisper_mutex.unlock ();
839
961
});
840
962
841
963
svr.set_exception_handler ([](const Request &, Response &res, std::exception_ptr ep) {
0 commit comments