Spaces:
Running
Running
Aleksander Andrzejewski
committed on
Server : Add support for .vtt format to Whisper server (#1578)
Browse files
- The code comes from examples/main
- The output MIME type is set to text/vtt
Example usage:
```shell
curl 127.0.0.1:8080/inference \
-H "Content-Type: multipart/form-data" \
-F file="@samples/jfk.wav" \
-F temperature="0.2" \
-F response-format="vtt"
```
- examples/server/server.cpp +23 -0
examples/server/server.cpp
CHANGED
|
@@ -678,6 +678,29 @@ int main(int argc, char ** argv) {
|
|
| 678 |
ss << speaker << text << "\n\n";
|
| 679 |
}
|
| 680 |
res.set_content(ss.str(), "application/x-subrip");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 681 |
}
|
| 682 |
// TODO add more output formats
|
| 683 |
else
|
|
|
|
| 678 |
ss << speaker << text << "\n\n";
|
| 679 |
}
|
| 680 |
res.set_content(ss.str(), "application/x-subrip");
|
| 681 |
+
} else if (params.response_format == vtt_format) {
|
| 682 |
+
std::stringstream ss;
|
| 683 |
+
|
| 684 |
+
ss << "WEBVTT\n\n";
|
| 685 |
+
|
| 686 |
+
const int n_segments = whisper_full_n_segments(ctx);
|
| 687 |
+
for (int i = 0; i < n_segments; ++i) {
|
| 688 |
+
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 689 |
+
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 690 |
+
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 691 |
+
std::string speaker = "";
|
| 692 |
+
|
| 693 |
+
if (params.diarize && pcmf32s.size() == 2)
|
| 694 |
+
{
|
| 695 |
+
speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
|
| 696 |
+
speaker.insert(0, "<v Speaker");
|
| 697 |
+
speaker.append(">");
|
| 698 |
+
}
|
| 699 |
+
|
| 700 |
+
ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
|
| 701 |
+
ss << speaker << text << "\n\n";
|
| 702 |
+
}
|
| 703 |
+
res.set_content(ss.str(), "text/vtt");
|
| 704 |
}
|
| 705 |
// TODO add more output formats
|
| 706 |
else
|