ggerganov committed on
Commit
d3e767a
·
unverified ·
1 Parent(s): aa64fa0

whisper : reduce delta_min from 1000ms to 100ms (#3028)

Browse files
Files changed (1) hide show
  1. src/whisper.cpp +11 -9
src/whisper.cpp CHANGED
@@ -5527,11 +5527,13 @@ int whisper_full_with_state(
5527
  const int seek_start = params.offset_ms/10;
5528
  const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
5529
 
5530
- // if length of spectrogram is less than 1.0s (100 frames), then return
5531
- // basically don't process anything that is less than 1.0s
5532
- // see issue #39: https://github.com/ggml-org/whisper.cpp/issues/39
5533
- if (seek_end < seek_start + 100) {
5534
- WHISPER_LOG_WARN("%s: input is too short - %d ms < 1000 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
 
 
5535
  return 0;
5536
  }
5537
 
@@ -5675,8 +5677,8 @@ int whisper_full_with_state(
5675
  ctx, state, progress_cur, params.progress_callback_user_data);
5676
  }
5677
 
5678
- // if only 1 second left, then stop
5679
- if (seek + 100 >= seek_end) {
5680
  break;
5681
  }
5682
 
@@ -6023,10 +6025,10 @@ int whisper_full_with_state(
6023
  // end of segment
6024
  if (token.id == whisper_token_eot(ctx) || // end of text token
6025
  (params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached
6026
- (has_ts && seek + seek_delta + 100 >= seek_end) // end of audio reached
6027
  ) {
6028
  if (result_len == 0 && !params.no_timestamps) {
6029
- if (seek + seek_delta + 100 >= seek_end) {
6030
  result_len = i + 1;
6031
  } else {
6032
  WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);
 
5527
  const int seek_start = params.offset_ms/10;
5528
  const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
5529
 
5530
+ // if length of spectrogram is less than 100ms (10 frames), then return
5531
+ // basically don't process anything that is less than 100ms
5532
+ // ref: https://github.com/ggml-org/whisper.cpp/issues/2065
5533
+ const int delta_min = 10;
5534
+
5535
+ if (seek_end < seek_start + delta_min) {
5536
+ WHISPER_LOG_WARN("%s: input is too short - %d ms < 100 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
5537
  return 0;
5538
  }
5539
 
 
5677
  ctx, state, progress_cur, params.progress_callback_user_data);
5678
  }
5679
 
5680
+ // if only 100ms left, then stop
5681
+ if (seek + delta_min >= seek_end) {
5682
  break;
5683
  }
5684
 
 
6025
  // end of segment
6026
  if (token.id == whisper_token_eot(ctx) || // end of text token
6027
  (params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached
6028
+ (has_ts && seek + seek_delta + delta_min >= seek_end) // end of audio reached (100ms)
6029
  ) {
6030
  if (result_len == 0 && !params.no_timestamps) {
6031
+ if (seek + seek_delta + delta_min >= seek_end) {
6032
  result_len = i + 1;
6033
  } else {
6034
  WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);