|
| 1 | +whispercpp |
| 2 | +========== |
| 3 | + |
| 4 | + |
| 5 | + |
| 6 | +Ruby bindings for [whisper.cpp][], an interface to an automatic speech recognition model. |
| 7 | + |
| 8 | +Installation |
| 9 | +------------ |
| 10 | + |
| 11 | +Install the gem and add it to the application's Gemfile by executing: |
| 12 | + |
| 13 | + $ bundle add whispercpp |
| 14 | + |
| 15 | +If bundler is not being used to manage dependencies, install the gem by executing: |
| 16 | + |
| 17 | + $ gem install whispercpp |
| 18 | + |
| 19 | +Usage |
| 20 | +----- |
| 21 | + |
| 22 | +```ruby |
| 23 | +require "whisper" |
| 24 | + |
| 25 | +whisper = Whisper::Context.new("path/to/model.bin") |
| 26 | + |
| 27 | +params = Whisper::Params.new |
| 28 | +params.language = "en" |
| 29 | +params.offset = 10_000 |
| 30 | +params.duration = 60_000 |
| 31 | +params.max_text_tokens = 300 |
| 32 | +params.translate = true |
| 33 | +params.print_timestamps = false |
| 34 | + |
| 35 | +whisper.transcribe("path/to/audio.wav", params) do |whole_text| |
| 36 | + puts whole_text |
| 37 | +end |
| 38 | + |
| 39 | +``` |
| 40 | + |
| 41 | +### Preparing model ### |
| 42 | + |
| 43 | +Use the script to download model file(s): |
| 44 | + |
| 45 | +```bash |
| 46 | +git clone https://github.com/ggerganov/whisper.cpp.git |
| 47 | +cd whisper.cpp |
| 48 | +sh ./models/download-ggml-model.sh base.en |
| 49 | +``` |
| 50 | + |
| 51 | +There are several types of models. See the [models][] page for details. |
| 52 | + |
| 53 | +### Preparing audio file ### |
| 54 | + |
| 55 | +Currently, whisper.cpp accepts only 16-bit WAV files. |
| 56 | + |
| 57 | +### API ### |
| 58 | + |
| 59 | +Once `Whisper::Context#transcribe` has been called, you can retrieve segments with `#each_segment`: |
| 60 | + |
| 61 | +```ruby |
| 62 | +def format_time(time_ms) |
| 63 | + sec, decimal_part = time_ms.divmod(1000) |
| 64 | + min, sec = sec.divmod(60) |
| 65 | + hour, min = min.divmod(60) |
| 66 | + "%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part] |
| 67 | +end |
| 68 | + |
| 69 | +whisper.transcribe("path/to/audio.wav", params) |
| 70 | + |
| 71 | +whisper.each_segment.with_index do |segment, index| |
| 72 | + line = "[%{nth}: %{st} --> %{ed}] %{text}" % { |
| 73 | + nth: index + 1, |
| 74 | + st: format_time(segment.start_time), |
| 75 | + ed: format_time(segment.end_time), |
| 76 | + text: segment.text |
| 77 | + } |
| 78 | + line << " (speaker turned)" if segment.speaker_next_turn? |
| 79 | + puts line |
| 80 | +end |
| 81 | + |
| 82 | +``` |
| 83 | + |
| 84 | +You can also add a hook to params that is called on each new segment: |
| 85 | + |
| 86 | +```ruby |
| 87 | +def format_time(time_ms) |
| 88 | + sec, decimal_part = time_ms.divmod(1000) |
| 89 | + min, sec = sec.divmod(60) |
| 90 | + hour, min = min.divmod(60) |
| 91 | + "%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part] |
| 92 | +end |
| 93 | + |
| 94 | +# Add hook before calling #transcribe |
| 95 | +params.on_new_segment do |segment| |
| 96 | + line = "[%{st} --> %{ed}] %{text}" % { |
| 97 | + st: format_time(segment.start_time), |
| 98 | + ed: format_time(segment.end_time), |
| 99 | + text: segment.text |
| 100 | + } |
| 101 | + line << " (speaker turned)" if segment.speaker_next_turn? |
| 102 | + puts line |
| 103 | +end |
| 104 | + |
| 105 | +whisper.transcribe("path/to/audio.wav", params) |
| 106 | + |
| 107 | +``` |
| 108 | + |
| 109 | +[whisper.cpp]: https://github.com/ggerganov/whisper.cpp |
| 110 | +[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models |
0 commit comments