Aleksander Andrzejewski committed on
Commit
17f1f89
·
unverified ·
1 Parent(s): 77aa181

Server: Add support for .vtt format to Whisper server (#1578)

Browse files

- The code comes from examples/main
- The output mimetype is set to text/vtt

Example usage:
```shell
curl 127.0.0.1:8080/inference \
-H "Content-Type: multipart/form-data" \
-F file="@samples/jfk.wav" \
-F temperature="0.2" \
-F response-format="vtt"
```

Files changed (1) hide show
  1. examples/server/server.cpp +23 -0
examples/server/server.cpp CHANGED
@@ -678,6 +678,29 @@ int main(int argc, char ** argv) {
678
  ss << speaker << text << "\n\n";
679
  }
680
  res.set_content(ss.str(), "application/x-subrip");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
681
  }
682
  // TODO add more output formats
683
  else
 
678
  ss << speaker << text << "\n\n";
679
  }
680
  res.set_content(ss.str(), "application/x-subrip");
681
+ } else if (params.response_format == vtt_format) {
682
+ std::stringstream ss;
683
+
684
+ ss << "WEBVTT\n\n";
685
+
686
+ const int n_segments = whisper_full_n_segments(ctx);
687
+ for (int i = 0; i < n_segments; ++i) {
688
+ const char * text = whisper_full_get_segment_text(ctx, i);
689
+ const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
690
+ const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
691
+ std::string speaker = "";
692
+
693
+ if (params.diarize && pcmf32s.size() == 2)
694
+ {
695
+ speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
696
+ speaker.insert(0, "<v Speaker");
697
+ speaker.append(">");
698
+ }
699
+
700
+ ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
701
+ ss << speaker << text << "\n\n";
702
+ }
703
+ res.set_content(ss.str(), "text/vtt");
704
  }
705
  // TODO add more output formats
706
  else