@@ -3,8 +3,7 @@
 import itertools
 import logging
 import numpy as np
-import pyaudio
-import torch
+
 import queue
 import threading
 import time
@@ -29,22 +28,6 @@
 from dailyai.services.ai_services import TTSService
 from dailyai.transports.abstract_transport import AbstractTransport
 
-torch.set_num_threads(1)
-
-model, utils = torch.hub.load(
-    repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
-)
-
-(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
-
-# Taken from utils_vad.py
-
-
-def validate(model, inputs: torch.Tensor):
-    with torch.no_grad():
-        outs = model(inputs)
-    return outs
-
 
 # Provided by Alexander Veysov
 
@@ -58,12 +41,7 @@ def int2float(sound):
     return sound
 
 
-FORMAT = pyaudio.paInt16
-CHANNELS = 1
 SAMPLE_RATE = 16000
-CHUNK = int(SAMPLE_RATE / 10)
-
-audio = pyaudio.PyAudio()
 
 
 class VADState(Enum):
@@ -90,6 +68,24 @@ def __init__(
                 "Sorry, you can't use speaker_enabled and vad_enabled at the same time. Please set one to False."
             )
 
+        if self._vad_enabled:
+            try:
+                global torch, torchaudio
+                import torch
+                # We don't use torchaudio here, but we need to try importing it because
+                # Silero uses it
+                import torchaudio
+                torch.set_num_threads(1)
+
+                (self.model, self.utils) = torch.hub.load(
+                    repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
+                )
+
+            except ModuleNotFoundError as e:
+                print(f"Exception: {e}")
+                print("In order to use VAD, you'll need to install the `torch` and `torchaudio` modules.")
+                raise Exception(f"Missing module(s): {e}")
+
         self._vad_samples = 1536
         vad_frame_s = self._vad_samples / SAMPLE_RATE
         self._vad_start_frames = round(self._vad_start_s / vad_frame_s)
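This hunk moves the torch import out of module scope, so transports without VAD never need the dependency installed. Below is a minimal, self-contained sketch of that deferred-import pattern; the `OptionalVAD` class name and constructor argument are illustrative, not part of the commit:

# Sketch of the deferred-import pattern introduced above: torch is only
# imported (and the Silero model only downloaded) when VAD is requested.
class OptionalVAD:
    def __init__(self, vad_enabled: bool):
        self.model = None
        if not vad_enabled:
            return
        try:
            # `global` + `import` binds the modules at module scope, so
            # later code (e.g. torch.from_numpy in the VAD loop) can use them.
            global torch, torchaudio
            import torch
            import torchaudio  # imported only because Silero requires it
            torch.set_num_threads(1)
            self.model, self.utils = torch.hub.load(
                repo_or_dir="snakers4/silero-vad",
                model="silero_vad",
                force_reload=False,
            )
        except ModuleNotFoundError as e:
            raise Exception(
                f"VAD requires the `torch` and `torchaudio` modules: {e}"
            )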
@@ -276,7 +272,7 @@ def _vad(self):
             audio_chunk = self.read_audio_frames(self._vad_samples)
             audio_int16 = np.frombuffer(audio_chunk, np.int16)
             audio_float32 = int2float(audio_int16)
-            new_confidence = model(
+            new_confidence = self.model(
                 torch.from_numpy(audio_float32), 16000).item()
             speaking = new_confidence > 0.5
 
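For context, here is a hedged stand-alone sketch of the per-frame check this hunk touches: a 1536-sample int16 chunk (~96 ms at 16 kHz) is normalized to float32 in [-1, 1] and fed to the Silero model, which returns a speech confidence in [0, 1]. The `int2float` body follows the standard Silero example (the file's own helper is elided from the diff), and `frame_is_speech` is a hypothetical wrapper:

import numpy as np
import torch

SAMPLE_RATE = 16000
VAD_SAMPLES = 1536  # frame size used above

def int2float(sound: np.ndarray) -> np.ndarray:
    # int16 PCM -> float32 in [-1, 1], as in the Silero VAD examples.
    sound = sound.astype(np.float32)
    if np.abs(sound).max() > 0:
        sound *= 1.0 / 32768.0
    return sound.squeeze()

def frame_is_speech(model, audio_chunk: bytes) -> bool:
    # `model` is the Silero VAD model loaded via torch.hub in __init__.
    audio_int16 = np.frombuffer(audio_chunk, np.int16)
    audio_float32 = int2float(audio_int16)
    confidence = model(torch.from_numpy(audio_float32), SAMPLE_RATE).item()
    return confidence > 0.5  # same threshold as the code above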