whisper.cpp

Running

App Files Files Community

ggerganov committed on Oct 7, 2022

Commit

606a6bc

unverified ·

1 Parent(s): 03f79ff

ref #10 : option to keep context in "stream" example

Browse files

It seems the results become worse when we keep the context, so this option
is not enabled by default

Files changed (3) hide show

stream.cpp +5 -0
whisper.cpp +12 -7
whisper.h +1 -0

stream.cpp CHANGED Viewed

@@ -40,6 +40,7 @@ struct whisper_params {
     bool verbose              = false;
     bool translate            = false;
     bool print_special_tokens = false;
     bool no_timestamps        = true;
@@ -64,6 +65,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
             params.verbose = true;
         } else if (arg == "--translate") {
             params.translate = true;
         } else if (arg == "-l" || arg == "--language") {
             params.language = argv[++i];
             if (whisper_lang_id(params.language.c_str()) == -1) {
@@ -103,6 +106,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
     fprintf(stderr, "            --step N         audio step size in milliseconds (default: %d)\n", params.step_ms);
     fprintf(stderr, "  -v,       --verbose        verbose output\n");
     fprintf(stderr, "            --translate      translate from source language to english\n");
     fprintf(stderr, "  -ps,      --print_special  print special tokens\n");
     fprintf(stderr, "  -nt,      --no_timestamps  do not print timestamps\n");
     fprintf(stderr, "  -l LANG,  --language LANG  spoken language (default: %s)\n", params.language.c_str());
@@ -273,6 +277,7 @@ int main(int argc, char ** argv) {
             wparams.print_realtime       = false;
             wparams.print_timestamps     = !params.no_timestamps;
             wparams.translate            = params.translate;
             wparams.language             = params.language.c_str();
             wparams.n_threads            = params.n_threads;

     bool verbose              = false;
     bool translate            = false;
+    bool no_context           = true;
     bool print_special_tokens = false;
     bool no_timestamps        = true;
             params.verbose = true;
         } else if (arg == "--translate") {
             params.translate = true;
+        } else if (arg == "-kc" || arg == "--keep-context") {
+            params.no_context = false;
         } else if (arg == "-l" || arg == "--language") {
             params.language = argv[++i];
             if (whisper_lang_id(params.language.c_str()) == -1) {
     fprintf(stderr, "            --step N         audio step size in milliseconds (default: %d)\n", params.step_ms);
     fprintf(stderr, "  -v,       --verbose        verbose output\n");
     fprintf(stderr, "            --translate      translate from source language to english\n");
+    // The advertised flag must match what whisper_params_parse accepts ("-kc" / "--keep-context").
+    // The default is derived from params.no_context (keep-context is its negation) so the help text cannot drift.
+    fprintf(stderr, "  -kc,      --keep-context   keep context from earlier audio (default: %s)\n", params.no_context ? "false" : "true");
     fprintf(stderr, "  -ps,      --print_special  print special tokens\n");
     fprintf(stderr, "  -nt,      --no_timestamps  do not print timestamps\n");
     fprintf(stderr, "  -l LANG,  --language LANG  spoken language (default: %s)\n", params.language.c_str());
             wparams.print_realtime       = false;
             wparams.print_timestamps     = !params.no_timestamps;
             wparams.translate            = params.translate;
+            wparams.no_context           = params.no_context;
             wparams.language             = params.language.c_str();
             wparams.n_threads            = params.n_threads;

whisper.cpp CHANGED Viewed

@@ -405,6 +405,8 @@ struct whisper_context {
     std::vector<whisper_result>  result_cur;
     std::vector<whisper_segment> result_all;
 };
 // load the model from a ggml file
@@ -1020,8 +1022,6 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
 //   - model:      the model
 //   - n_threads:  number of threads to use
 //   - mel_offset: offset in the mel spectrogram (i.e. audio offset)
-//   - mel_inp:    input mel spectrogram
-//   - features:   output encoded features
 //
 bool whisper_encode(
               whisper_context & wctx,
@@ -1405,10 +1405,9 @@ bool whisper_encode(
 //
 //   - model:      the model
 //   - n_threads:  number of threads to use
-//   - n_past:     prompt length
-//   - prompt:     text prompt
-//   - logits_out: output logits
-//   - probs_out:  output probabilities
 //
 bool whisper_decode(
               whisper_context & wctx,
@@ -2259,6 +2258,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_decode_strat
                     .offset_ms = 0,
                     .translate            = false,
                     .print_special_tokens = false,
                     .print_progress       = true,
                     .print_realtime       = false,
@@ -2279,6 +2279,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_decode_strat
                     .offset_ms = 0,
                     .translate            = false,
                     .print_special_tokens = false,
                     .print_progress       = true,
                     .print_realtime       = false,
@@ -2297,6 +2298,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_decode_strat
     return result;
 }
 int whisper_full(
         struct whisper_context * ctx,
         struct whisper_full_params params,
@@ -2309,7 +2311,10 @@ int whisper_full(
     }
     // the accumulated text context so far
-    std::vector<whisper_token> prompt_past = { };
     // these tokens determine the task that will be performed
     std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };

     std::vector<whisper_result>  result_cur;
     std::vector<whisper_segment> result_all;
+    std::vector<whisper_token> prompt_past;
 };
 // load the model from a ggml file
 //   - model:      the model
 //   - n_threads:  number of threads to use
 //   - mel_offset: offset in the mel spectrogram (i.e. audio offset)
 //
 bool whisper_encode(
               whisper_context & wctx,
 //
 //   - model:      the model
 //   - n_threads:  number of threads to use
+//   - tokens:     text prompt
+//   - n_tokens:   number of tokens in the prompt
+//   - n_past:     number of past tokens to prefix the prompt with
 //
 bool whisper_decode(
               whisper_context & wctx,
                     .offset_ms = 0,
                     .translate            = false,
+                    .no_context           = false,
                     .print_special_tokens = false,
                     .print_progress       = true,
                     .print_realtime       = false,
                     .offset_ms = 0,
                     .translate            = false,
+                    .no_context           = false,
                     .print_special_tokens = false,
                     .print_progress       = true,
                     .print_realtime       = false,
     return result;
 }
 int whisper_full(
         struct whisper_context * ctx,
         struct whisper_full_params params,
     }
     // the accumulated text context so far
+    auto & prompt_past = ctx->prompt_past;
+    if (params.no_context) {
+        prompt_past.clear();
+    }
     // these tokens determine the task that will be performed
     std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };

whisper.h CHANGED Viewed

@@ -105,6 +105,7 @@ extern "C" {
         int offset_ms;
         bool translate;
         bool print_special_tokens;
         bool print_progress;
         bool print_realtime;

         int offset_ms;
         bool translate;
+        bool no_context;
         bool print_special_tokens;
         bool print_progress;
         bool print_realtime;