@@ -59,8 +59,12 @@ struct whisper_params {
59
59
int32_t duration_ms = 0 ;
60
60
int32_t max_context = -1 ;
61
61
int32_t max_len = 0 ;
62
+ int32_t best_of = 5 ;
63
+ int32_t beam_size = -1 ;
62
64
63
- float word_thold = 0 .01f ;
65
+ float word_thold = 0 .01f ;
66
+ float entropy_thold = 2 .4f ;
67
+ float logprob_thold = -1 .0f ;
64
68
65
69
bool speed_up = false ;
66
70
bool translate = false ;
@@ -104,7 +108,11 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
104
108
else if (arg == " -d" || arg == " --duration" ) { params.duration_ms = std::stoi (argv[++i]); }
105
109
else if (arg == " -mc" || arg == " --max-context" ) { params.max_context = std::stoi (argv[++i]); }
106
110
else if (arg == " -ml" || arg == " --max-len" ) { params.max_len = std::stoi (argv[++i]); }
111
+ else if (arg == " -bo" || arg == " --best-of" ) { params.best_of = std::stoi (argv[++i]); }
112
+ else if (arg == " -bs" || arg == " --beam-size" ) { params.beam_size = std::stoi (argv[++i]); }
107
113
else if (arg == " -wt" || arg == " --word-thold" ) { params.word_thold = std::stof (argv[++i]); }
114
+ else if (arg == " -et" || arg == " --entropy-thold" ) { params.entropy_thold = std::stof (argv[++i]); }
115
+ else if (arg == " -lpt" || arg == " --logprob-thold" ) { params.logprob_thold = std::stof (argv[++i]); }
108
116
else if (arg == " -su" || arg == " --speed-up" ) { params.speed_up = true ; }
109
117
else if (arg == " -tr" || arg == " --translate" ) { params.translate = true ; }
110
118
else if (arg == " -di" || arg == " --diarize" ) { params.diarize = true ; }
@@ -136,31 +144,35 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
136
144
fprintf (stderr, " usage: %s [options] file0.wav file1.wav ...\n " , argv[0 ]);
137
145
fprintf (stderr, " \n " );
138
146
fprintf (stderr, " options:\n " );
139
- fprintf (stderr, " -h, --help [default] show this help message and exit\n " );
140
- fprintf (stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n " , params.n_threads );
141
- fprintf (stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n " , params.n_processors );
142
- fprintf (stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n " , params.offset_t_ms );
143
- fprintf (stderr, " -on N, --offset-n N [%-7d] segment index offset\n " , params.offset_n );
144
- fprintf (stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n " , params.duration_ms );
145
- fprintf (stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n " , params.max_context );
146
- fprintf (stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n " , params.max_len );
147
- fprintf (stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n " , params.word_thold );
148
- fprintf (stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n " , params.speed_up ? " true" : " false" );
149
- fprintf (stderr, " -tr, --translate [%-7s] translate from source language to english\n " , params.translate ? " true" : " false" );
150
- fprintf (stderr, " -di, --diarize [%-7s] stereo audio diarization\n " , params.diarize ? " true" : " false" );
151
- fprintf (stderr, " -otxt, --output-txt [%-7s] output result in a text file\n " , params.output_txt ? " true" : " false" );
152
- fprintf (stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n " , params.output_vtt ? " true" : " false" );
153
- fprintf (stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n " , params.output_srt ? " true" : " false" );
154
- fprintf (stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n " , params.output_wts ? " true" : " false" );
155
- fprintf (stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n " , params.output_csv ? " true" : " false" );
156
- fprintf (stderr, " -ps, --print-special [%-7s] print special tokens\n " , params.print_special ? " true" : " false" );
157
- fprintf (stderr, " -pc, --print-colors [%-7s] print colors\n " , params.print_colors ? " true" : " false" );
158
- fprintf (stderr, " -pp, --print-progress [%-7s] print progress\n " , params.print_progress ? " true" : " false" );
159
- fprintf (stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n " , params.no_timestamps ? " false" : " true" );
160
- fprintf (stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n " , params.language .c_str ());
161
- fprintf (stderr, " --prompt PROMPT [%-7s] initial prompt\n " , params.prompt .c_str ());
162
- fprintf (stderr, " -m FNAME, --model FNAME [%-7s] model path\n " , params.model .c_str ());
163
- fprintf (stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n " , " " );
147
+ fprintf (stderr, " -h, --help [default] show this help message and exit\n " );
148
+ fprintf (stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n " , params.n_threads );
149
+ fprintf (stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n " , params.n_processors );
150
+ fprintf (stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n " , params.offset_t_ms );
151
+ fprintf (stderr, " -on N, --offset-n N [%-7d] segment index offset\n " , params.offset_n );
152
+ fprintf (stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n " , params.duration_ms );
153
+ fprintf (stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n " , params.max_context );
154
+ fprintf (stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n " , params.max_len );
155
+ fprintf (stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n " , params.best_of );
156
+ fprintf (stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n " , params.beam_size );
157
+ fprintf (stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n " , params.word_thold );
158
+ fprintf (stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n " , params.entropy_thold );
159
+ fprintf (stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n " , params.logprob_thold );
160
+ fprintf (stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n " , params.speed_up ? " true" : " false" );
161
+ fprintf (stderr, " -tr, --translate [%-7s] translate from source language to english\n " , params.translate ? " true" : " false" );
162
+ fprintf (stderr, " -di, --diarize [%-7s] stereo audio diarization\n " , params.diarize ? " true" : " false" );
163
+ fprintf (stderr, " -otxt, --output-txt [%-7s] output result in a text file\n " , params.output_txt ? " true" : " false" );
164
+ fprintf (stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n " , params.output_vtt ? " true" : " false" );
165
+ fprintf (stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n " , params.output_srt ? " true" : " false" );
166
+ fprintf (stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n " , params.output_wts ? " true" : " false" );
167
+ fprintf (stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n " , params.output_csv ? " true" : " false" );
168
+ fprintf (stderr, " -ps, --print-special [%-7s] print special tokens\n " , params.print_special ? " true" : " false" );
169
+ fprintf (stderr, " -pc, --print-colors [%-7s] print colors\n " , params.print_colors ? " true" : " false" );
170
+ fprintf (stderr, " -pp, --print-progress [%-7s] print progress\n " , params.print_progress ? " true" : " false" );
171
+ fprintf (stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n " , params.no_timestamps ? " false" : " true" );
172
+ fprintf (stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n " , params.language .c_str ());
173
+ fprintf (stderr, " --prompt PROMPT [%-7s] initial prompt\n " , params.prompt .c_str ());
174
+ fprintf (stderr, " -m FNAME, --model FNAME [%-7s] model path\n " , params.model .c_str ());
175
+ fprintf (stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n " , " " );
164
176
fprintf (stderr, " \n " );
165
177
}
166
178
@@ -235,7 +247,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
235
247
const char * text = whisper_full_get_token_text (ctx, i, j);
236
248
const float p = whisper_full_get_token_p (ctx, i, j);
237
249
238
- const int col = std::max (0 , std::min ((int ) k_colors.size (), (int ) (std::pow (p, 3 )*float (k_colors.size ()))));
250
+ const int col = std::max (0 , std::min ((int ) k_colors.size () - 1 , (int ) (std::pow (p, 3 )*float (k_colors.size ()))));
239
251
240
252
printf (" %s%s%s%s" , speaker.c_str (), k_colors[col].c_str (), text, " \033 [0m" );
241
253
}
@@ -331,20 +343,19 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
331
343
const int n_segments = whisper_full_n_segments (ctx);
332
344
for (int i = 0 ; i < n_segments; ++i) {
333
345
const char * text = whisper_full_get_segment_text (ctx, i);
334
- if (text[0 ] == ' ' )
335
- text = text + sizeof (char ); // whisper_full_get_segment_text() returns a string with leading space, point to the next character.
346
+ if (text[0 ] == ' ' ) {
347
+ text = text + sizeof (char ); // whisper_full_get_segment_text() returns a string with leading space, point to the next character.
348
+ }
336
349
const int64_t t0 = whisper_full_get_segment_t0 (ctx, i);
337
350
const int64_t t1 = whisper_full_get_segment_t1 (ctx, i);
338
- // need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
339
- fout << 10 * t0 << " , "
340
- << 10 * t1 << " , \" "
341
- << text << " \"\n " ;
351
+
352
+ // need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
353
+ fout << 10 * t0 << " , " << 10 * t1 << " , \" " << text << " \"\n " ;
342
354
}
343
355
344
356
return true ;
345
357
}
346
358
347
-
348
359
// karaoke video generation
349
360
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
350
361
// TODO: font parameter adjustments
@@ -620,6 +631,8 @@ int main(int argc, char ** argv) {
620
631
{
621
632
whisper_full_params wparams = whisper_full_default_params (WHISPER_SAMPLING_GREEDY);
622
633
634
+ wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
635
+
623
636
wparams.print_realtime = false ;
624
637
wparams.print_progress = params.print_progress ;
625
638
wparams.print_timestamps = !params.no_timestamps ;
@@ -633,12 +646,18 @@ int main(int argc, char ** argv) {
633
646
634
647
wparams.token_timestamps = params.output_wts || params.max_len > 0 ;
635
648
wparams.thold_pt = params.word_thold ;
649
+ wparams.entropy_thold = params.entropy_thold ;
650
+ wparams.logprob_thold = params.logprob_thold ;
636
651
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len ;
637
652
638
653
wparams.speed_up = params.speed_up ;
639
654
640
- wparams.prompt_tokens = prompt_tokens.empty () ? nullptr : prompt_tokens.data ();
641
- wparams.prompt_n_tokens = prompt_tokens.empty () ? 0 : prompt_tokens.size ();
655
+ wparams.greedy .best_of = params.best_of ;
656
+ wparams.beam_search .beam_size = params.beam_size ;
657
+ wparams.temperature_inc = -1 ;
658
+
659
+ wparams.prompt_tokens = prompt_tokens.empty () ? nullptr : prompt_tokens.data ();
660
+ wparams.prompt_n_tokens = prompt_tokens.empty () ? 0 : prompt_tokens.size ();
642
661
643
662
whisper_print_user_data user_data = { &params, &pcmf32s };
644
663
0 commit comments