
Commit 59bbd71

Merge remote-tracking branch 'ggerganov/master'
* ggerganov/master: (60 commits)
  sync : ggml (#0)
  ggml : fix IQ3_XXS on Metal (llama/5219)
  sync : ggml (llama/0)
  Faster AVX2 dot product for IQ2_XS (llama/5187)
  SOTA 3-bit quants (llama/5196)
  ggml alloc: Fix for null dereference on alloc failure (llama/5200)
  Nomic Vulkan backend (llama/4456)
  ggml : add max buffer sizes to opencl and metal backends (llama/5181)
  metal : free metal objects (llama/5161)
  gguf : fix comparison (ggml/715)
  `ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)
  gguf : add input validation, prevent integer overflows (ggml/709)
  ci : fix yolo URLs + fix metal capture (ggml/712)
  metal : add debug capture backend function (ggml/694)
  common : fix wav buffer detection (ggml-org#1819)
  server : add fields to `verbose_json` response (ggml-org#1802)
  make : update MSYS_NT (ggml-org#1813)
  talk-llama : sync llama.cpp
  sync : ggml
  ggml : add Vulkan backend (llama/2059)
  ...
2 parents: eda6990 + 7a74e92

33 files changed: +6811 -2939 lines

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -320,7 +320,8 @@ if (WHISPER_ALL_WARNINGS)
 endif()
 
 if (NOT MSVC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
+    # TODO: temporary disabled until we figure out ggml-metal.m
+    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
     #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
 endif()
 
@@ -509,6 +510,7 @@ else()
 endif()
 
 if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_link_libraries(${TARGET} PUBLIC
         ${CMAKE_DL_LIBS}
         )

Makefile

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 		CPUINFO_CMD := sysctl machdep.cpu.features machdep.cpu.leaf7_features
 	else ifeq ($(UNAME_S),Linux)
 		CPUINFO_CMD := cat /proc/cpuinfo
-	else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
+	else ifneq (,$(filter MINGW32_NT% MINGW64_NT% MSYS_NT%,$(UNAME_S)))
 		CPUINFO_CMD := cat /proc/cpuinfo
 	else ifneq (,$(filter DragonFly FreeBSD,$(UNAME_S)))
 		CPUINFO_CMD := grep Features /var/run/dmesg.boot

README.md

Lines changed: 73 additions & 67 deletions
Large diffs are not rendered by default.

bindings/javascript/README.md

Lines changed: 2 additions & 2 deletions
@@ -41,7 +41,7 @@ make publish-npm
 
 ## Sample run
 
-```java
+```text
 $ node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
 
 whisper_model_load: loading model from 'whisper.bin'
@@ -63,7 +63,7 @@ whisper_model_load: ggml ctx size = 140.60 MB
 whisper_model_load: memory size = 22.83 MB
 whisper_model_load: model size = 140.54 MB
 
-system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |
+system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |
 
 operator(): processing 176000 samples, 11.0 sec, 8 threads, 1 processors, lang = en, task = transcribe ...

examples/common-ggml.cpp

Lines changed: 2 additions & 0 deletions
@@ -64,6 +64,7 @@ bool ggml_common_quantize_0(
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
         case GGML_FTYPE_MOSTLY_IQ2_XXS:
         case GGML_FTYPE_MOSTLY_IQ2_XS:
+        case GGML_FTYPE_MOSTLY_IQ3_XXS:
             {
                 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                 return false;
@@ -195,6 +196,7 @@ bool ggml_common_quantize_0(
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_COUNT:
             {
                 fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));

examples/common.cpp

Lines changed: 21 additions & 0 deletions
@@ -615,6 +615,21 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
 
 }
 
+bool is_wav_buffer(const std::string buf) {
+    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
+    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
+    if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") {
+        return false;
+    }
+
+    uint32_t chunk_size = *reinterpret_cast<const uint32_t*>(buf.data() + 4);
+    if (chunk_size + 8 != buf.size()) {
+        return false;
+    }
+
+    return true;
+}
+
 bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
     drwav wav;
     std::vector<uint8_t> wav_data; // used for pipe input from stdin
@@ -639,6 +654,12 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
 
         fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
     }
+    else if (is_wav_buffer(fname)) {
+        if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
+            fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
+            return false;
+        }
+    }
     else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
         fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
         return false;
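
The new is_wav_buffer() accepts a buffer only when it starts with the RIFF/WAVE magic and the RIFF chunk size matches the buffer length. A minimal sketch of a buffer that would pass the check (the make_minimal_wav_header helper is illustrative, not part of the commit):

#include <cstdint>
#include <string>

// Builds a bare 12-byte RIFF/WAVE header: "RIFF" + chunk size + "WAVE".
// With nothing after the header, the chunk size field must be size - 8 = 4,
// which is exactly what is_wav_buffer() verifies.
std::string make_minimal_wav_header() {
    std::string buf = "RIFF";
    const uint32_t chunk_size = 4; // bytes following the 8-byte "RIFF" + size prefix
    buf.append(reinterpret_cast<const char *>(&chunk_size), 4);
    buf += "WAVE";
    return buf; // is_wav_buffer(buf) would return true for this 12-byte buffer
}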

examples/common.h

Lines changed: 4 additions & 0 deletions
@@ -135,7 +135,11 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
 // Audio utils
 //
 
+// Check if a buffer is a WAV audio file
+bool is_wav_buffer(const std::string buf);
+
 // Read WAV audio file and store the PCM data into pcmf32
+// fname can be a buffer of WAV data instead of a filename
 // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
 // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
 bool read_wav(
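
With the header change above, read_wav() accepts either a path or a complete WAV file held in memory; when the string passes is_wav_buffer(), drwav decodes it directly via drwav_init_memory(). A hedged usage sketch (decode_upload and audio_bytes are illustrative names, not from the commit):

#include <string>
#include <vector>
#include "common.h"

// audio_bytes is assumed to hold an entire WAV file, e.g. an uploaded request body.
bool decode_upload(const std::string & audio_bytes) {
    std::vector<float>              pcmf32;  // mono PCM output
    std::vector<std::vector<float>> pcmf32s; // per-channel PCM when stereo is requested

    // Because the buffer starts with "RIFF"/"WAVE", read_wav() treats it as WAV
    // data rather than a filename and decodes it from memory.
    return read_wav(audio_bytes, pcmf32, pcmf32s, /*stereo =*/ false);
}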

examples/server/server.cpp

Lines changed: 153 additions & 31 deletions
@@ -18,7 +18,7 @@
 #endif
 
 using namespace httplib;
-using json = nlohmann::json;
+using json = nlohmann::ordered_json;
 
 namespace {
 
@@ -543,7 +543,76 @@ int main(int argc, char ** argv) {
             {"Access-Control-Allow-Origin", "*"},
             {"Access-Control-Allow-Headers", "content-type"}});
 
-    std::string const default_content = "<html>hello</html>";
+    std::string const default_content = R"(
+    <html>
+    <head>
+      <title>Whisper.cpp Server</title>
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width">
+      <style>
+      body {
+        font-family: sans-serif;
+      }
+      form {
+        display: flex;
+        flex-direction: column;
+        align-items: flex-start;
+      }
+      label {
+        margin-bottom: 0.5rem;
+      }
+      input, select {
+        margin-bottom: 1rem;
+      }
+      button {
+        margin-top: 1rem;
+      }
+      </style>
+    </head>
+    <body>
+      <h1>Whisper.cpp Server</h1>
+
+      <h2>/inference</h2>
+      <pre>
+    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
+    -H "Content-Type: multipart/form-data" \
+    -F file="@&lt;file-path&gt;" \
+    -F temperature="0.0" \
+    -F temperature_inc="0.2" \
+    -F response_format="json"
+      </pre>
+
+      <h2>/load</h2>
+      <pre>
+    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/load \
+    -H "Content-Type: multipart/form-data" \
+    -F model="&lt;path-to-model-file&gt;"
+      </pre>
+
+      <div>
+        <h2>Try it out</h2>
+        <form action="/inference" method="POST" enctype="multipart/form-data">
+          <label for="file">Choose an audio file:</label>
+          <input type="file" id="file" name="file" accept="audio/*" required><br>
+
+          <label for="temperature">Temperature:</label>
+          <input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
+
+          <label for="response_format">Response Format:</label>
+          <select id="response_format" name="response_format">
+            <option value="verbose_json">Verbose JSON</option>
+            <option value="json">JSON</option>
+            <option value="text">Text</option>
+            <option value="srt">SRT</option>
+            <option value="vtt">VTT</option>
+          </select><br>
+
+          <button type="submit">Submit</button>
+        </form>
+      </div>
+    </body>
+    </html>
+    )";
 
     // store default params so we can reset after each inference request
     whisper_params default_params = params;
@@ -556,15 +625,14 @@ int main(int argc, char ** argv) {
 
     svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
         // acquire whisper model mutex lock
-        whisper_mutex.lock();
+        std::lock_guard<std::mutex> lock(whisper_mutex);
 
         // first check user requested fields of the request
         if (!req.has_file("file"))
         {
             fprintf(stderr, "error: no 'file' field in the request\n");
             const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
             res.set_content(error_resp, "application/json");
-            whisper_mutex.unlock();
             return;
         }
         auto audio_file = req.get_file_value("file");
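
Replacing the manual lock()/unlock() pairs with a std::lock_guard ties the mutex to the handler's scope, so every early return above releases it automatically. A minimal standalone sketch of the pattern (not the server code itself):

#include <mutex>

std::mutex whisper_mutex;

bool handle_request(bool has_file) {
    // Acquired here, released when `lock` goes out of scope.
    std::lock_guard<std::mutex> lock(whisper_mutex);

    if (!has_file) {
        return false; // mutex released automatically, no unlock() needed
    }

    // ... work while holding the mutex ...
    return true;      // released here as well
}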
@@ -579,35 +647,42 @@
         std::vector<float> pcmf32;               // mono-channel F32 PCM
         std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
 
-        // write to temporary file
-        const std::string temp_filename = "whisper_server_temp_file.wav";
-        std::ofstream temp_file{temp_filename, std::ios::binary};
-        temp_file << audio_file.content;
-        temp_file.close();
-
-        // if file is not wav, convert to wav
-
         if (sparams.ffmpeg_converter) {
+            // if file is not wav, convert to wav
+            // write to temporary file
+            const std::string temp_filename = "whisper_server_temp_file.wav";
+            std::ofstream temp_file{temp_filename, std::ios::binary};
+            temp_file << audio_file.content;
+            temp_file.close();
+
             std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
             const bool is_converted = convert_to_wav(temp_filename, error_resp);
             if (!is_converted) {
                 res.set_content(error_resp, "application/json");
-                whisper_mutex.unlock();
                 return;
             }
-        }
 
-        // read wav content into pcmf32
-        if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
-            const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
-            res.set_content(error_resp, "application/json");
+            // read wav content into pcmf32
+            if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
+            {
+                fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
+                const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
+                res.set_content(error_resp, "application/json");
+                std::remove(temp_filename.c_str());
+                return;
+            }
+            // remove temp file
             std::remove(temp_filename.c_str());
-            whisper_mutex.unlock();
-            return;
+        } else {
+            if (!::read_wav(audio_file.content, pcmf32, pcmf32s, params.diarize))
+            {
+                fprintf(stderr, "error: failed to read WAV file\n");
+                const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
+                res.set_content(error_resp, "application/json");
+                return;
+            }
         }
-        // remove temp file
-        std::remove(temp_filename.c_str());
+
 
         printf("Successfully loaded %s\n", filename.c_str());
 
@@ -681,6 +756,7 @@ int main(int argc, char ** argv) {
         wparams.logprob_thold    = params.logprob_thold;
 
         wparams.no_timestamps    = params.no_timestamps;
+        wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
 
         whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
 
@@ -724,7 +800,6 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                 const std::string error_resp = "{\"error\":\"failed to process audio\"}";
                 res.set_content(error_resp, "application/json");
-                whisper_mutex.unlock();
                 return;
             }
         }
@@ -778,6 +853,59 @@ int main(int argc, char ** argv) {
                 ss << speaker << text << "\n\n";
             }
             res.set_content(ss.str(), "text/vtt");
+        } else if (params.response_format == vjson_format) {
+            /* try to match openai/whisper's Python format */
+            std::string results = output_str(ctx, params, pcmf32s);
+            json jres = json{
+                {"task", params.translate ? "translate" : "transcribe"},
+                {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
+                {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
+                {"text", results},
+                {"segments", json::array()}
+            };
+            const int n_segments = whisper_full_n_segments(ctx);
+            for (int i = 0; i < n_segments; ++i)
+            {
+                json segment = json{
+                    {"id", i},
+                    {"text", whisper_full_get_segment_text(ctx, i)},
+                };
+
+                if (!params.no_timestamps) {
+                    segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
+                    segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
+                }
+
+                float total_logprob = 0;
+                const int n_tokens = whisper_full_n_tokens(ctx, i);
+                for (int j = 0; j < n_tokens; ++j) {
+                    whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
+                    if (token.id >= whisper_token_eot(ctx)) {
+                        continue;
+                    }
+
+                    segment["tokens"].push_back(token.id);
+                    json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
+                    if (!params.no_timestamps) {
+                        word["start"] = token.t0 * 0.01;
+                        word["end"] = token.t1 * 0.01;
+                    }
+                    word["probability"] = token.p;
+                    total_logprob += token.plog;
+                    segment["words"].push_back(word);
+                }
+
+                segment["temperature"] = params.temperature;
+                segment["avg_logprob"] = total_logprob / n_tokens;
+
+                // TODO compression_ratio and no_speech_prob are not implemented yet
+                // segment["compression_ratio"] = 0;
+                // segment["no_speech_prob"] = 0;
+
+                jres["segments"].push_back(segment);
+            }
+            res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
+                            "application/json");
         }
         // TODO add more output formats
         else
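
Per the fields assembled in the hunk above, a verbose_json response carries task, language, duration, text, and a segments array with optional start/end times, token ids, and per-word probabilities. A rough client-side sketch of reading those fields with nlohmann::json (assumes the bundled json.hpp header; error handling omitted):

#include <cstdio>
#include <string>

#include "json.hpp"

// body is assumed to be the raw verbose_json response from /inference.
void print_segments(const std::string & body) {
    const nlohmann::json res = nlohmann::json::parse(body);

    std::printf("language = %s, duration = %.2f s\n",
            res["language"].get<std::string>().c_str(),
            res["duration"].get<double>());

    for (const auto & seg : res["segments"]) {
        // start/end are present only when timestamps are enabled
        if (seg.contains("start")) {
            std::printf("[%6.2f -> %6.2f] ", seg["start"].get<double>(), seg["end"].get<double>());
        }
        std::printf("%s\n", seg["text"].get<std::string>().c_str());
    }
}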
@@ -792,18 +920,14 @@ int main(int argc, char ** argv) {
 
         // reset params to thier defaults
         params = default_params;
-
-        // return whisper model mutex lock
-        whisper_mutex.unlock();
     });
     svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
-        whisper_mutex.lock();
+        std::lock_guard<std::mutex> lock(whisper_mutex);
         if (!req.has_file("model"))
         {
             fprintf(stderr, "error: no 'model' field in the request\n");
             const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
             res.set_content(error_resp, "application/json");
-            whisper_mutex.unlock();
             return;
         }
         std::string model = req.get_file_value("model").content;
@@ -812,7 +936,6 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
            const std::string error_resp = "{\"error\":\"model not found!\"}";
            res.set_content(error_resp, "application/json");
-            whisper_mutex.unlock();
            return;
        }
 
@@ -835,7 +958,6 @@ int main(int argc, char ** argv) {
         res.set_content(success, "application/text");
 
         // check if the model is in the file system
-        whisper_mutex.unlock();
     });
 
     svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
