
Commit f582624

Added dataset processing makefiles
1 parent a30010a

9 files changed: +99 -22 lines

makefiles/Makefile.WhisperSpeech (+80)
@@ -0,0 +1,80 @@
+LANGUAGE ?= en
+TRANSCRIPTION_MODEL ?= medium
+STOKS_MODEL ?= vq-sarah_teal.model
+STOKS_MODEL_SHORT = $(patsubst %.model,%,$(notdir $(STOKS_MODEL)))
+.SUFFIXES:
+.PRECIOUS: %.tar.gz
+
+# main goal
+dataset:
+
+# a list of all the derived datasets we can generate
+DATASETS := snr-c50 stoks-max-$(STOKS_MODEL_SHORT) encodec-3kbps $(TRANSCRIPTION_MODEL)-txt mvad spk_emb vad
+
+# all source shards
+SRC_FILES := $(notdir $(wildcard $D/audio/*.tar))
+
+# function to generate a list of derived shards
+derived = $(SRC_FILES:%=$D/$1/%.gz)
+
+# create common targets for all the datasets
+define dataset_template
+all_$(1): $(call derived,$(1))
+.PHONY: all_$(1)
+
+$D/$(1):
+	mkdir -p $D/$(1)
+
+dataset: all_$(1)
+dirs: $D/$(1)
+endef
+$(foreach ds,$(DATASETS),$(eval $(call dataset_template,$(ds))))
+
+$D/vad/%.tar.gz: $D/audio/%.tar | $D/vad
+	python -m whisperspeech.vad '$<' '$@'
+
+$D/all-files: $(call derived,vad)
+	parallel -j16 "tar tf {} | grep '.vad.npy$$' | sed -e s/.vad.npy//" ::: $^ > "$@"
+dataset: $D/all-files
+all_all-files: $D/all-files
+
+$D/spk_emb/%.tar.gz: $D/audio/%.tar $D/vad/%.tar.gz | $D/spk_emb
+	python -m whisperspeech.extract_spk_emb --batch_size 16 '$<' '$@'
+
+$D/mvad/%: $D/vad/% $D/spk_emb/% | $D/mvad
+	python -m whisperspeech.vad_merge '$<' '$@'
+
+# These values of target-specific variables will be saved for each dataset.
+# This allows us to include multiple datasets (with different $D's) in a single global Makefile
+# and make sure the variables will be properly substituted in the command lists.
+$D/$(TRANSCRIPTION_MODEL)-txt/%.tar.gz: TRANSCRIPTION_MODEL:=$(TRANSCRIPTION_MODEL)
+$D/$(TRANSCRIPTION_MODEL)-txt/%.tar.gz: LANGUAGE:=$(LANGUAGE)
+$D/$(TRANSCRIPTION_MODEL)-txt/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz | $D/$(TRANSCRIPTION_MODEL)-txt
+	python -m whisperspeech.prepare_t2s_txts --language="$(LANGUAGE)" --transcription_model="$(TRANSCRIPTION_MODEL)" "$<" "$@"
+
+$D/encodec-3kbps/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz | $D/encodec-3kbps
+	JOBS_PER_GPU=3 TIME_LIMIT=30:00 python -m whisperspeech.prepare_s2a_atoks --batch_size=4 "$<" "$@"
+
+$D/stoks-max-$(STOKS_MODEL_SHORT)/%.tar.gz: STOKS_MODEL := $(STOKS_MODEL)
+$D/stoks-max-$(STOKS_MODEL_SHORT)/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz | $D/stoks-max-$(STOKS_MODEL_SHORT)
+	JOBS_PER_GPU=2 TIME_LIMIT=30:00 python -m whisperspeech.extract_stoks --vq_model "$(STOKS_MODEL)" --batch_size=8 "$<" "$@"
+
+$D/snr-c50/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz
+	JOBS_PER_GPU=2 TIME_LIMIT=30:00 python -m whisperspeech.extract_metrics "$<" "$@"
+
+# We don't need to make $TRANSCRIPTION_MODEL target-specific here since it will be baked into
+# the rule prereqs and later we only use the result via the automatic $^ variable.
+# The same logic applies to $D (and its use in $@).
+$D/txt-samples.list: $(call derived,$(TRANSCRIPTION_MODEL)-txt)
+	parallel tar tf {} ::: $^ | sed -e 's/\.txt//' > "$@"
+dataset: $D/txt-samples.list
+
+$D/atoks-samples.list: $(call derived,encodec-3kbps) | $D/encodec-3kbps
+	parallel tar tf {} ::: $^ | sed -e 's/\.atoks\.npy//' > "$@"
+dataset: $D/atoks-samples.list
+
+$D/language: LANGUAGE:=$(LANGUAGE)
+$D/language:
+	printf "%s" "$(LANGUAGE)" > "$@"
+dataset: $D/language
+all_language: $D/language
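The `dataset_template`/`$(eval $(call ...))` pair is the heart of this Makefile: for every name in `DATASETS` it stamps out an aggregate target, a directory-creation rule, and hooks into the shared `dataset` and `dirs` goals. As a rough sketch (the shard name and the `$D` value below are made up for illustration), `$(eval $(call dataset_template,vad))` evaluates to:

# hypothetical expansion, assuming $D = datasets/en-sample and a single
# source shard audio-000000.tar:
all_vad: datasets/en-sample/vad/audio-000000.tar.gz
.PHONY: all_vad

datasets/en-sample/vad:
	mkdir -p datasets/en-sample/vad

dataset: all_vad
dirs: datasets/en-sample/vad

The `TRANSCRIPTION_MODEL:=...`-style target-specific assignments later in the file snapshot the current variable values for each pattern rule; that is what keeps the recipes correct when several dataset Makefiles, each with its own `$D` and settings, are included into one top-level make run.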

makefiles/Makefile.en-sample (+4)
@@ -0,0 +1,4 @@
+TRANSCRIPTION_MODEL=small.en
+D := $(patsubst %/,%,$(dir $(lastword $(MAKEFILE_LIST))))
+
+include $D/../Makefile.WhisperSpeech
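The `D :=` assignment is the usual self-locating idiom: `$(lastword $(MAKEFILE_LIST))` is the path of the file currently being read, `$(dir ...)` keeps its directory part, and `$(patsubst %/,%,...)` strips the trailing slash. Assuming the file is read from inside a dataset directory (the path is hypothetical):

# $(lastword $(MAKEFILE_LIST))  -> datasets/en-sample/Makefile
# $(dir ...)                    -> datasets/en-sample/
# $(patsubst %/,%,...)          -> datasets/en-sample

so `$D` doubles as the dataset root (`$D/audio/*.tar`) and as the anchor for locating the shared rules at `$D/../Makefile.WhisperSpeech`.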

makefiles/Makefile.mlang-sample (+5)
@@ -0,0 +1,5 @@
+TRANSCRIPTION_MODEL := medium
+D := $(patsubst %/,%,$(dir $(lastword $(MAKEFILE_LIST))))
+LANGUAGE := $(word 2,$(subst -, ,$(notdir $(realpath $D))))
+
+include $D/../Makefile.WhisperSpeech
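The extra `LANGUAGE` line derives the language code from the dataset directory's real name instead of hard-coding it, assuming a hyphen-separated naming convention with the language code in the second field. A hypothetical walkthrough for a directory named `mls-pl-sample`:

# $(realpath $D)    -> /data/mls-pl-sample
# $(notdir ...)     -> mls-pl-sample
# $(subst -, ,...)  -> mls pl sample
# $(word 2,...)     -> pl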

makefiles/README.md (+6)
@@ -0,0 +1,6 @@
+## WhisperSpeech dataset Makefiles
+
+We are moving towards a make-based dataset processing setup. We have a `Makefile.WhisperSpeech`
+include file which contains all the rules for data processing, and per-dataset Makefiles that
+configure the dataset-specific options. There is also a global Makefile that simply includes all
+the per-dataset Makefiles.
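For concreteness, the global Makefile described above could be little more than a list of includes (the paths below are hypothetical; the actual layout may differ):

# top-level Makefile (sketch)
include datasets/librilight-en/Makefile
include datasets/mls-pl-sample/Makefile

Because every included file adds its targets as prerequisites of the shared `dataset` goal, a single `make -j8 dataset` then processes all datasets.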

whisperspeech/a2wav.py (-7)
@@ -50,10 +50,3 @@ def decode_to_notebook(self, atoks):
 
         audio = self.decode(atoks)
         display(Audio(audio.cpu().numpy(), rate=24000))
-
-    def decode_to_playback(self, atoks):
-        import sounddevice as sd
-        audio = self.decode(atoks)
-        audio_np = audio.cpu().numpy().squeeze()
-        sd.play(audio_np, 24000)
-        sd.wait()

whisperspeech/extract_spk_emb.py (+1 -2)
@@ -5,7 +5,6 @@
 
 # %% ../nbs/2A. Speaker Embeddings.ipynb 3
 import os
-from os.path import expanduser
 import sys
 
 from fastprogress import progress_bar

@@ -49,7 +48,7 @@ def process_shard(
     dl = chunked_dataset(input, bs=batch_size)
 
     classifier = EncoderClassifier.from_hparams("speechbrain/spkrec-ecapa-voxceleb",
-                                                savedir=expanduser("~/.cache/speechbrain/"),
+                                                savedir=f"{os.environ['HOME']}/.cache/speechbrain/",
                                                 run_opts = {"device": device})
 
     with utils.AtomicTarWriter(utils.derived_name(input, f'spk_emb')) as sink:

whisperspeech/extract_stoks.py (+1 -2)
@@ -6,7 +6,6 @@
 # %% ../nbs/3B. Semantic token extraction.ipynb 2
 import sys
 import os
-from os.path import expanduser
 import itertools
 from pathlib import Path
 

@@ -41,7 +40,7 @@ def prepare_stoks(
     # vq_model.encode_mel = torch.compile(vq_model.encode_mel, mode="reduce-overhead", fullgraph=True)
 
     spk_classifier = EncoderClassifier.from_hparams("speechbrain/spkrec-ecapa-voxceleb",
-                                                    savedir=expanduser("~/.cache/speechbrain/"),
+                                                    savedir=f"{os.environ['HOME']}/.cache/speechbrain/",
                                                     run_opts = {"device": device})
 
     total = n_samples//batch_size if n_samples else 'noinfer'

whisperspeech/fetch_models.py (+1 -1)
@@ -4,11 +4,11 @@
 __all__ = []
 
 # %% ../nbs/0. Download models.ipynb 1
+from os.path import expanduser
 from fastcore.script import call_parse
 import whisperx
 import whisper
 from speechbrain.pretrained import EncoderClassifier
-from os.path import expanduser
 
 # %% ../nbs/0. Download models.ipynb 3
 def load_whisperx(model, lang):

whisperspeech/pipeline.py (+1 -10)
@@ -75,7 +75,7 @@ def extract_spk_emb(self, fname):
         if device == 'mps': device = 'cpu' # operator 'aten::_fft_r2c' is not currently implemented for the MPS device
         from speechbrain.pretrained import EncoderClassifier
         self.encoder = EncoderClassifier.from_hparams("speechbrain/spkrec-ecapa-voxceleb",
-                                                      savedir = expanduser("~/.cache/speechbrain/"),
+                                                      savedir=expanduser("~/.cache/speechbrain/"),
                                                       run_opts={"device": device})
         audio_info = torchaudio.info(fname)
         actual_sample_rate = audio_info.sample_rate

@@ -105,12 +105,3 @@ def generate_to_file(self, fname, text, speaker=None, lang='en', cps=15, step_ca
 
     def generate_to_notebook(self, text, speaker=None, lang='en', cps=15, step_callback=None):
         self.vocoder.decode_to_notebook(self.generate_atoks(text, speaker, lang=lang, cps=cps, step_callback=None))
-
-    def generate_to_playback(self, text, speaker=None, lang='en', cps=15, step_callback=None):
-        try:
-            import sounddevice as sd
-        except ImportError:
-            print("\033[93mThe 'sounddevice' library is required for direct text to playback functionality. Please install it using 'pip install sounddevice'.\033[0m")
-            return
-
-        self.vocoder.decode_to_playback(self.generate_atoks(text, speaker, lang=lang, cps=cps, step_callback=step_callback))
