Spaces:
Running
Running
whisper : reduce delta_min from 1000ms to 100ms (#3028)
Browse files- src/whisper.cpp +11 -9
src/whisper.cpp
CHANGED
|
@@ -5527,11 +5527,13 @@ int whisper_full_with_state(
|
|
| 5527 |
const int seek_start = params.offset_ms/10;
|
| 5528 |
const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
|
| 5529 |
|
| 5530 |
-
// if length of spectrogram is less than 1.0s (100 frames), then return
|
| 5531 |
-
// basically don't process anything that is less than 1.0s
|
| 5532 |
-
//
|
| 5533 |
-
|
| 5534 |
-
|
|
|
|
|
|
|
| 5535 |
return 0;
|
| 5536 |
}
|
| 5537 |
|
|
@@ -5675,8 +5677,8 @@ int whisper_full_with_state(
|
|
| 5675 |
ctx, state, progress_cur, params.progress_callback_user_data);
|
| 5676 |
}
|
| 5677 |
|
| 5678 |
-
// if only 1 second left, then stop
|
| 5679 |
-
if (seek + 100 >= seek_end) {
|
| 5680 |
break;
|
| 5681 |
}
|
| 5682 |
|
|
@@ -6023,10 +6025,10 @@ int whisper_full_with_state(
|
|
| 6023 |
// end of segment
|
| 6024 |
if (token.id == whisper_token_eot(ctx) || // end of text token
|
| 6025 |
(params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached
|
| 6026 |
-
(has_ts && seek + seek_delta + 100 >= seek_end) // end of audio reached
|
| 6027 |
) {
|
| 6028 |
if (result_len == 0 && !params.no_timestamps) {
|
| 6029 |
-
if (seek + seek_delta + 100 >= seek_end) {
|
| 6030 |
result_len = i + 1;
|
| 6031 |
} else {
|
| 6032 |
WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);
|
|
|
|
| 5527 |
const int seek_start = params.offset_ms/10;
|
| 5528 |
const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
|
| 5529 |
|
| 5530 |
+
// if length of spectrogram is less than 100ms (10 frames), then return
|
| 5531 |
+
// basically don't process anything that is less than 100ms
|
| 5532 |
+
// ref: https://github.com/ggml-org/whisper.cpp/issues/2065
|
| 5533 |
+
const int delta_min = 10;
|
| 5534 |
+
|
| 5535 |
+
if (seek_end < seek_start + delta_min) {
|
| 5536 |
+
WHISPER_LOG_WARN("%s: input is too short - %d ms < 100 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
|
| 5537 |
return 0;
|
| 5538 |
}
|
| 5539 |
|
|
|
|
| 5677 |
ctx, state, progress_cur, params.progress_callback_user_data);
|
| 5678 |
}
|
| 5679 |
|
| 5680 |
+
// if only 100ms left, then stop
|
| 5681 |
+
if (seek + delta_min >= seek_end) {
|
| 5682 |
break;
|
| 5683 |
}
|
| 5684 |
|
|
|
|
| 6025 |
// end of segment
|
| 6026 |
if (token.id == whisper_token_eot(ctx) || // end of text token
|
| 6027 |
(params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached
|
| 6028 |
+
(has_ts && seek + seek_delta + delta_min >= seek_end) // end of audio reached (100ms)
|
| 6029 |
) {
|
| 6030 |
if (result_len == 0 && !params.no_timestamps) {
|
| 6031 |
+
if (seek + seek_delta + delta_min >= seek_end) {
|
| 6032 |
result_len = i + 1;
|
| 6033 |
} else {
|
| 6034 |
WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);
|