
Commit f90c89a

Merge branch 'main' into setup.py-to-pyproject.toml
2 parents fcb7755 + 90db0de commit f90c89a

File tree: 3 files changed (+5, -3 lines)


README.md (+1, -1)
```diff
@@ -126,7 +126,7 @@ audio = whisper.load_audio("audio.mp3")
 audio = whisper.pad_or_trim(audio)
 
 # make log-Mel spectrogram and move to the same device as the model
-mel = whisper.log_mel_spectrogram(audio).to(model.device)
+mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
 
 # detect the spoken language
 _, probs = model.detect_language(mel)
```
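This README change makes the quickstart pick the number of Mel filters from the loaded checkpoint instead of hard-coding 80. A minimal sketch of the updated example, assuming an `audio.mp3` file in the working directory; `model.dims.n_mels` resolves to 80 for the older checkpoints and 128 for large-v3:

```python
import whisper

model = whisper.load_model("base")

audio = whisper.load_audio("audio.mp3")
audio = whisper.pad_or_trim(audio)

# n_mels now follows the loaded checkpoint, so the same snippet works for
# 80-mel models and the 128-mel large-v3 alike
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")
```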

whisper/audio.py (+2, -2)
```diff
@@ -122,7 +122,7 @@ def log_mel_spectrogram(
         The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
 
     n_mels: int
-        The number of Mel-frequency filters, only 80 is supported
+        The number of Mel-frequency filters, only 80 and 128 are supported
 
     padding: int
         Number of zero samples to pad to the right
@@ -132,7 +132,7 @@ def log_mel_spectrogram(
 
     Returns
     -------
-    torch.Tensor, shape = (80, n_frames)
+    torch.Tensor, shape = (n_mels, n_frames)
         A Tensor that contains the Mel spectrogram
     """
     if not torch.is_tensor(audio):
```
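The docstring now states that `log_mel_spectrogram` accepts either 80 or 128 Mel filters and that the output shape follows `n_mels`. A quick sketch of the documented behaviour, assuming a synthetic 30-second 16 kHz waveform purely to show the shapes:

```python
import numpy as np
import whisper

# 30 seconds of silence at 16 kHz; any 16 kHz waveform would do
audio = np.zeros(16000 * 30, dtype=np.float32)

mel_80 = whisper.log_mel_spectrogram(audio, n_mels=80)    # shape (80, n_frames)
mel_128 = whisper.log_mel_spectrogram(audio, n_mels=128)  # shape (128, n_frames)

print(mel_80.shape, mel_128.shape)
```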

whisper/transcribe.py (+2)
```diff
@@ -214,6 +214,8 @@ def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
             if (
                 no_speech_threshold is not None
                 and decode_result.no_speech_prob > no_speech_threshold
+                and logprob_threshold is not None
+                and decode_result.avg_logprob < logprob_threshold
             ):
                 needs_fallback = False  # silence
             if not needs_fallback:
```
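The added clauses tighten the silence short-circuit in `decode_with_fallback`: a segment is now treated as silence (suppressing the temperature fallback) only when the no-speech probability is high and the average log-probability falls below `logprob_threshold`, rather than on a high no-speech probability alone. A standalone sketch of the new condition; `DecodeStats` and `treated_as_silence` are hypothetical stand-ins for whisper's `DecodingResult` and the inline check, and the default thresholds shown match `transcribe()`'s defaults:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class DecodeStats:
    # stand-in for the fields used from whisper's DecodingResult
    no_speech_prob: float
    avg_logprob: float


def treated_as_silence(
    result: DecodeStats,
    no_speech_threshold: Optional[float] = 0.6,
    logprob_threshold: Optional[float] = -1.0,
) -> bool:
    # mirrors the updated condition: both a high no-speech probability and a
    # low average log-probability are required before the fallback is suppressed
    return (
        no_speech_threshold is not None
        and result.no_speech_prob > no_speech_threshold
        and logprob_threshold is not None
        and result.avg_logprob < logprob_threshold
    )


# high no-speech probability but acceptable log-probability: the fallback is
# no longer forced off, so e.g. an overly repetitive decode can still be retried
print(treated_as_silence(DecodeStats(no_speech_prob=0.8, avg_logprob=-0.3)))  # False
# high no-speech probability and low log-probability: still treated as silence
print(treated_as_silence(DecodeStats(no_speech_prob=0.8, avg_logprob=-1.5)))  # True
```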
