File tree 3 files changed +5
-3
lines changed
3 files changed +5
-3
lines changed Original file line number Diff line number Diff line change @@ -126,7 +126,7 @@ audio = whisper.load_audio("audio.mp3")
126
126
audio = whisper.pad_or_trim(audio)
127
127
128
128
# make log-Mel spectrogram and move to the same device as the model
129
- mel = whisper.log_mel_spectrogram(audio).to(model.device)
129
+ mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
130
130
131
131
# detect the spoken language
132
132
_, probs = model.detect_language(mel)
Original file line number Diff line number Diff line change @@ -122,7 +122,7 @@ def log_mel_spectrogram(
122
122
The path to an audio file, or a NumPy array or Tensor containing the audio waveform sampled at 16 kHz
123
123
124
124
n_mels: int
125
- The number of Mel-frequency filters, only 80 is supported
125
+ The number of Mel-frequency filters, only 80 and 128 are supported
126
126
127
127
padding: int
128
128
Number of zero samples to pad to the right
@@ -132,7 +132,7 @@ def log_mel_spectrogram(
132
132
133
133
Returns
134
134
-------
135
- torch.Tensor, shape = (80, n_frames)
135
+ torch.Tensor, shape = (n_mels, n_frames)
136
136
A Tensor that contains the Mel spectrogram
137
137
"""
138
138
if not torch.is_tensor(audio):
Original file line number Diff line number Diff line change @@ -214,6 +214,8 @@ def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
214
214
if (
215
215
no_speech_threshold is not None
216
216
and decode_result.no_speech_prob > no_speech_threshold
217
+ and logprob_threshold is not None
218
+ and decode_result.avg_logprob < logprob_threshold
217
219
):
218
220
needs_fallback = False # silence
219
221
if not needs_fallback:
You can’t perform that action at this time.
0 commit comments