
Commit 2f88e6a

first commit

0 parents


62 files changed: 12455 additions & 0 deletions

.gitignore

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
__pycache__/
build/
*.egg-info/
*.so
*.mp4

tmp*
trial*/

data
data_utils/face_tracking/3DMM/*
data_utils/face_parsing/79999_iter.pth

pretrained
*.mp4

LICENSE

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 hawkey

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

assets/main.png

182 KB
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
# Routines for DeepSpeech features processing
Several routines for [DeepSpeech](https://github.com/mozilla/DeepSpeech) feature processing, such as speech feature generation for the [VOCA](https://github.com/TimoBolkart/voca) model.

## Installation

```
pip3 install -r requirements.txt
```

## Usage

Generate wav files:
```
python3 extract_wav.py --in-video=<your_data_dir>
```

Generate files with DeepSpeech features:
```
python3 extract_ds_features.py --input=<your_data_dir>
```
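
Beyond the command-line scripts, the module added in this commit exports `conv_audios_to_deepspeech`, so the features can also be generated from Python. A minimal sketch, not part of the committed README; the module name and file paths below are placeholders:
```
# Hypothetical usage sketch; the module name and paths are placeholders.
from deepspeech_features import conv_audios_to_deepspeech

conv_audios_to_deepspeech(
    audios=["speaker.wav"],              # input wav file(s)
    out_files=["speaker_ds.npy"],        # output DeepSpeech feature windows
    num_frames_info=[None],              # None: frame count inferred from audio length
    deepspeech_pb_path="deepspeech.pb")  # frozen DeepSpeech graph
```
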
Lines changed: 275 additions & 0 deletions
@@ -0,0 +1,275 @@
"""
DeepSpeech features processing routines.
NB: Based on VOCA code. See the corresponding license restrictions.
"""

__all__ = ['conv_audios_to_deepspeech']

import numpy as np
import warnings
import resampy
from scipy.io import wavfile
from python_speech_features import mfcc
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def conv_audios_to_deepspeech(audios,
                              out_files,
                              num_frames_info,
                              deepspeech_pb_path,
                              audio_window_size=1,
                              audio_window_stride=1):
    """
    Convert list of audio files into files with DeepSpeech features.

    Parameters
    ----------
    audios : list of str or list of None
        Paths to input audio files.
    out_files : list of str
        Paths to output files with DeepSpeech features.
    num_frames_info : list of int
        List of frame counts, one per audio file.
    deepspeech_pb_path : str
        Path to DeepSpeech 0.1.0 frozen model.
    audio_window_size : int, default 1
        Audio window size.
    audio_window_stride : int, default 1
        Audio window stride.
    """
    # deepspeech_pb_path="/disk4/keyu/DeepSpeech/deepspeech-0.9.2-models.pbmm"
    graph, logits_ph, input_node_ph, input_lengths_ph = prepare_deepspeech_net(
        deepspeech_pb_path)

    with tf.compat.v1.Session(graph=graph) as sess:
        for audio_file_path, out_file_path, num_frames in zip(audios, out_files, num_frames_info):
            print(audio_file_path)
            print(out_file_path)
            audio_sample_rate, audio = wavfile.read(audio_file_path)
            if audio.ndim != 1:
                warnings.warn(
                    "Audio has multiple channels, the first channel is used")
                audio = audio[:, 0]
            ds_features = pure_conv_audio_to_deepspeech(
                audio=audio,
                audio_sample_rate=audio_sample_rate,
                audio_window_size=audio_window_size,
                audio_window_stride=audio_window_stride,
                num_frames=num_frames,
                net_fn=lambda x: sess.run(
                    logits_ph,
                    feed_dict={
                        input_node_ph: x[np.newaxis, ...],
                        input_lengths_ph: [x.shape[0]]}))

            net_output = ds_features.reshape(-1, 29)
            win_size = 16
            zero_pad = np.zeros((int(win_size / 2), net_output.shape[1]))
            net_output = np.concatenate(
                (zero_pad, net_output, zero_pad), axis=0)
            windows = []
            for window_index in range(0, net_output.shape[0] - win_size, 2):
                windows.append(
                    net_output[window_index:window_index + win_size])
            print(np.array(windows).shape)
            np.save(out_file_path, np.array(windows))


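# Note added for clarity, not part of the original commit: the loop above
# re-windows the per-frame 29-dimensional DeepSpeech logits with a fixed
# 16-frame window and a stride of 2, so each saved .npy file holds an array
# of shape (num_windows, 16, 29).

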
def prepare_deepspeech_net(deepspeech_pb_path):
    """
    Load and prepare DeepSpeech network.

    Parameters
    ----------
    deepspeech_pb_path : str
        Path to DeepSpeech 0.1.0 frozen model.

    Returns
    -------
    graph : obj
        TensorFlow graph.
    logits_ph : obj
        TensorFlow placeholder for `logits`.
    input_node_ph : obj
        TensorFlow placeholder for `input_node`.
    input_lengths_ph : obj
        TensorFlow placeholder for `input_lengths`.
    """
    # Load graph and placeholders:
    with tf.io.gfile.GFile(deepspeech_pb_path, "rb") as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())

    graph = tf.compat.v1.get_default_graph()
    tf.import_graph_def(graph_def, name="deepspeech")
    logits_ph = graph.get_tensor_by_name("deepspeech/logits:0")
    input_node_ph = graph.get_tensor_by_name("deepspeech/input_node:0")
    input_lengths_ph = graph.get_tensor_by_name("deepspeech/input_lengths:0")

    return graph, logits_ph, input_node_ph, input_lengths_ph


def pure_conv_audio_to_deepspeech(audio,
                                  audio_sample_rate,
                                  audio_window_size,
                                  audio_window_stride,
                                  num_frames,
                                  net_fn):
    """
    Core routine for converting audio into DeepSpeech features.

    Parameters
    ----------
    audio : np.array
        Audio data.
    audio_sample_rate : int
        Audio sample rate.
    audio_window_size : int
        Audio window size.
    audio_window_stride : int
        Audio window stride.
    num_frames : int or None
        Number of frames.
    net_fn : func
        Function for DeepSpeech model call.

    Returns
    -------
    np.array
        DeepSpeech features.
    """
    target_sample_rate = 16000
    if audio_sample_rate != target_sample_rate:
        resampled_audio = resampy.resample(
            x=audio.astype(float),
            sr_orig=audio_sample_rate,
            sr_new=target_sample_rate)
    else:
        resampled_audio = audio.astype(float)
    input_vector = conv_audio_to_deepspeech_input_vector(
        audio=resampled_audio.astype(np.int16),
        sample_rate=target_sample_rate,
        num_cepstrum=26,
        num_context=9)

    network_output = net_fn(input_vector)
    # print(network_output.shape)

    deepspeech_fps = 50
    video_fps = 50  # Change this option if video fps is different
    audio_len_s = float(audio.shape[0]) / audio_sample_rate
    if num_frames is None:
        num_frames = int(round(audio_len_s * video_fps))
    else:
        video_fps = num_frames / audio_len_s
    network_output = interpolate_features(
        features=network_output[:, 0],
        input_rate=deepspeech_fps,
        output_rate=video_fps,
        output_len=num_frames)

    # Make windows:
    zero_pad = np.zeros((int(audio_window_size / 2), network_output.shape[1]))
    network_output = np.concatenate(
        (zero_pad, network_output, zero_pad), axis=0)
    windows = []
    for window_index in range(0, network_output.shape[0] - audio_window_size, audio_window_stride):
        windows.append(
            network_output[window_index:window_index + audio_window_size])

    return np.array(windows)


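# Note added for clarity, not part of the original commit:
# conv_audios_to_deepspeech calls this routine with audio_window_size=1 and
# audio_window_stride=1, so the zero padding above is empty and each returned
# "window" is a single 29-dimensional row of DeepSpeech logits, roughly one
# per output video frame.

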
def conv_audio_to_deepspeech_input_vector(audio,
                                          sample_rate,
                                          num_cepstrum,
                                          num_context):
    """
    Convert audio raw data into DeepSpeech input vector.

    Parameters
    ----------
    audio : np.array
        Audio data.
    sample_rate : int
        Audio sample rate.
    num_cepstrum : int
        Number of cepstral coefficients.
    num_context : int
        Number of past/future context frames.

    Returns
    -------
    np.array
        DeepSpeech input vector.
    """
    # Get MFCC coefficients:
    features = mfcc(
        signal=audio,
        samplerate=sample_rate,
        numcep=num_cepstrum)

    # We only keep every second feature (BiRNN stride = 2):
    features = features[::2]

    # One stride per time step in the input:
    num_strides = len(features)

    # Add empty initial and final contexts:
    empty_context = np.zeros((num_context, num_cepstrum), dtype=features.dtype)
    features = np.concatenate((empty_context, features, empty_context))

    # Create a view into the array with overlapping strides of size
    # numcontext (past) + 1 (present) + numcontext (future):
    window_size = 2 * num_context + 1
    train_inputs = np.lib.stride_tricks.as_strided(
        features,
        shape=(num_strides, window_size, num_cepstrum),
        strides=(features.strides[0],
                 features.strides[0], features.strides[1]),
        writeable=False)

    # Flatten the second and third dimensions:
    train_inputs = np.reshape(train_inputs, [num_strides, -1])

    train_inputs = np.copy(train_inputs)
    train_inputs = (train_inputs - np.mean(train_inputs)) / \
        np.std(train_inputs)

    return train_inputs


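# Note added for clarity, not part of the original commit: with num_cepstrum=26
# and num_context=9 (the values used above), the context window spans
# 2 * 9 + 1 = 19 MFCC frames, so each row of the returned matrix holds
# 19 * 26 = 494 values, the flattened input consumed via input_node.

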
def interpolate_features(features,
                         input_rate,
                         output_rate,
                         output_len):
    """
    Interpolate DeepSpeech features.

    Parameters
    ----------
    features : np.array
        DeepSpeech features.
    input_rate : int
        Input rate (FPS).
    output_rate : int
        Output rate (FPS).
    output_len : int
        Output data length.

    Returns
    -------
    np.array
        Interpolated data.
    """
    input_len = features.shape[0]
    num_features = features.shape[1]
    input_timestamps = np.arange(input_len) / float(input_rate)
    output_timestamps = np.arange(output_len) / float(output_rate)
    output_features = np.zeros((output_len, num_features))
    for feature_idx in range(num_features):
        output_features[:, feature_idx] = np.interp(
            x=output_timestamps,
            xp=input_timestamps,
            fp=features[:, feature_idx])
    return output_features
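
For illustration, `interpolate_features` resamples the 50 fps DeepSpeech outputs onto the video frame rate with one `np.interp` call per feature channel. A small standalone sketch, not part of the commit; the import assumes the module file above is named `deepspeech_features.py`:
```
import numpy as np

from deepspeech_features import interpolate_features  # assumed module name

# Two seconds of 29-dim features at DeepSpeech's 50 fps ...
feats_50fps = np.random.randn(100, 29)
# ... resampled onto a 25 fps video track of the same duration (50 frames).
feats_25fps = interpolate_features(
    features=feats_50fps, input_rate=50, output_rate=25, output_len=50)
print(feats_25fps.shape)  # (50, 29)
```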

0 commit comments
