
Commit f582624

Added dataset processing makefiles
1 parent a30010a

9 files changed: +99 -22 lines

makefiles/Makefile.WhisperSpeech (+80)
@@ -0,0 +1,80 @@
+LANGUAGE ?= en
+TRANSCRIPTION_MODEL ?= medium
+STOKS_MODEL ?= vq-sarah_teal.model
+STOKS_MODEL_SHORT = $(patsubst %.model,%,$(notdir $(STOKS_MODEL)))
+.SUFFIXES:
+.PRECIOUS: %.tar.gz
+
+# main goal
+dataset:
+
+# a list of all the derived datasets we can generate
+DATASETS := snr-c50 stoks-max-$(STOKS_MODEL_SHORT) encodec-3kbps $(TRANSCRIPTION_MODEL)-txt mvad spk_emb vad
+
+# all source shards
+SRC_FILES := $(notdir $(wildcard $D/audio/*.tar))
+
+# function to generate a list of derived shards
+derived = $(SRC_FILES:%=$D/$1/%.gz)
+
+# create common targets for all the datasets
+define dataset_template
+all_$(1): $(call derived,$(1))
+.PHONY: all_$(1)
+
+$D/$(1):
+	mkdir -p $D/$(1)
+
+dataset: all_$(1)
+dirs: $D/$(1)
+endef
+$(foreach ds,$(DATASETS),$(eval $(call dataset_template,$(ds))))
+
+$D/vad/%.tar.gz: $D/audio/%.tar | $D/vad
+	python -m whisperspeech.vad '$<' '$@'
+
+$D/all-files: $(call derived,vad)
+	parallel -j16 "tar tf {} | grep '.vad.npy$$' | sed -e s/.vad.npy//" ::: $^ > "$@"
+dataset: $D/all-files
+all_all-files: $D/all-files
+
+$D/spk_emb/%.tar.gz: $D/audio/%.tar $D/vad/%.tar.gz | $D/spk_emb
+	python -m whisperspeech.extract_spk_emb --batch_size 16 '$<' '$@'
+
+$D/mvad/%: $D/vad/% $D/spk_emb/% | $D/mvad
+	python -m whisperspeech.vad_merge '$<' '$@'
+
+# These values of target-specific variables will be saved for each dataset.
+# This allows us to include multiple datasets (with different $D's) in a single global Makefile
+# and make sure the variables will be properly substituted in the command lists.
+$D/$(TRANSCRIPTION_MODEL)-txt/%.tar.gz: TRANSCRIPTION_MODEL:=$(TRANSCRIPTION_MODEL)
+$D/$(TRANSCRIPTION_MODEL)-txt/%.tar.gz: LANGUAGE:=$(LANGUAGE)
+$D/$(TRANSCRIPTION_MODEL)-txt/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz | $D/$(TRANSCRIPTION_MODEL)-txt
+	python -m whisperspeech.prepare_t2s_txts --language="$(LANGUAGE)" --transcription_model="$(TRANSCRIPTION_MODEL)" "$<" "$@"
+
+$D/encodec-3kbps/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz | $D/encodec-3kbps
+	JOBS_PER_GPU=3 TIME_LIMIT=30:00 python -m whisperspeech.prepare_s2a_atoks --batch_size=4 "$<" "$@"
+
+$D/stoks-max-$(STOKS_MODEL_SHORT)/%.tar.gz: STOKS_MODEL := $(STOKS_MODEL)
+$D/stoks-max-$(STOKS_MODEL_SHORT)/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz | $D/stoks-max-$(STOKS_MODEL_SHORT)
+	JOBS_PER_GPU=2 TIME_LIMIT=30:00 python -m whisperspeech.extract_stoks --vq_model "$(STOKS_MODEL)" --batch_size=8 "$<" "$@"
+
+$D/snr-c50/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz
+	JOBS_PER_GPU=2 TIME_LIMIT=30:00 python -m whisperspeech.extract_metrics "$<" "$@"
+
+# We don't need to make $TRANSCRIPTION_MODEL target-specific here since it will be baked into
+# the rule prereqs and later we only use the result via the automatic $^ variable.
+# The same logic applies to $D (and its use in $@).
+$D/txt-samples.list: $(call derived,$(TRANSCRIPTION_MODEL)-txt)
+	parallel tar tf {} ::: $^ | sed -e 's/\.txt//' > "$@"
+dataset: $D/txt-samples.list
+
+$D/atoks-samples.list: $(call derived,encodec-3kbps) | $D/encodec-3kbps
+	parallel tar tf {} ::: $^ | sed -e 's/\.atoks\.npy//' > "$@"
+dataset: $D/atoks-samples.list
+
+$D/language: LANGUAGE:=$(LANGUAGE)
+$D/language:
+	printf "%s" "$(LANGUAGE)" > "$@"
+dataset: $D/language
+all_language: $D/language
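The `dataset_template`/`$(eval $(call ...))` pair is the heart of this Makefile: for every name in `DATASETS` it stamps out an aggregate target, a directory-creation rule, and hooks into the shared `dataset` and `dirs` goals. As a rough sketch (the shard name and the `$D` value below are made up for illustration), `$(eval $(call dataset_template,vad))` evaluates to:

# hypothetical expansion, assuming $D = datasets/en-sample and a single
# source shard audio-000000.tar:
all_vad: datasets/en-sample/vad/audio-000000.tar.gz
.PHONY: all_vad

datasets/en-sample/vad:
	mkdir -p datasets/en-sample/vad

dataset: all_vad
dirs: datasets/en-sample/vad

The `TRANSCRIPTION_MODEL:=...`-style target-specific assignments later in the file snapshot the current variable values for each pattern rule; that is what keeps the recipes correct when several dataset Makefiles, each with its own `$D` and settings, are included into one top-level make run.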

makefiles/Makefile.en-sample (+4)
@@ -0,0 +1,4 @@
+TRANSCRIPTION_MODEL=small.en
+D := $(patsubst %/,%,$(dir $(lastword $(MAKEFILE_LIST))))
+
+include $D/../Makefile.WhisperSpeech
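The `D :=` assignment is the usual self-locating idiom: `$(lastword $(MAKEFILE_LIST))` is the path of the file currently being read, `$(dir ...)` keeps its directory part, and `$(patsubst %/,%,...)` strips the trailing slash. Assuming the file is read from inside a dataset directory (the path is hypothetical):

# $(lastword $(MAKEFILE_LIST))  -> datasets/en-sample/Makefile
# $(dir ...)                    -> datasets/en-sample/
# $(patsubst %/,%,...)          -> datasets/en-sample

so `$D` doubles as the dataset root (`$D/audio/*.tar`) and as the anchor for locating the shared rules at `$D/../Makefile.WhisperSpeech`.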

makefiles/Makefile.mlang-sample (+5)
@@ -0,0 +1,5 @@
+TRANSCRIPTION_MODEL := medium
+D := $(patsubst %/,%,$(dir $(lastword $(MAKEFILE_LIST))))
+LANGUAGE := $(word 2,$(subst -, ,$(notdir $(realpath $D))))
+
+include $D/../Makefile.WhisperSpeech
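The extra `LANGUAGE` line derives the language code from the dataset directory's real name instead of hard-coding it, assuming a hyphen-separated naming convention with the language code in the second field. A hypothetical walkthrough for a directory named `mls-pl-sample`:

# $(realpath $D)    -> /data/mls-pl-sample
# $(notdir ...)     -> mls-pl-sample
# $(subst -, ,...)  -> mls pl sample
# $(word 2,...)     -> pl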

makefiles/README.md (+6)
@@ -0,0 +1,6 @@
+## WhisperSpeech dataset Makefiles
+
+We are moving towards a make-based dataset processing setup. We have a `Makefile.WhisperSpeech`
+include file which contains all the rules for data processing, and per-dataset Makefiles that
+configure the dataset-specific options. There is also a global Makefile that simply includes all
+the per-dataset Makefiles.
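For concreteness, the global Makefile described above could be little more than a list of includes (the paths below are hypothetical; the actual layout may differ):

# top-level Makefile (sketch)
include datasets/librilight-en/Makefile
include datasets/mls-pl-sample/Makefile

Because every included file adds its targets as prerequisites of the shared `dataset` goal, a single `make -j8 dataset` then processes all datasets.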

whisperspeech/a2wav.py (-7)
@@ -50,10 +50,3 @@ def decode_to_notebook(self, atoks):
 
         audio = self.decode(atoks)
         display(Audio(audio.cpu().numpy(), rate=24000))
-
-    def decode_to_playback(self, atoks):
-        import sounddevice as sd
-        audio = self.decode(atoks)
-        audio_np = audio.cpu().numpy().squeeze()
-        sd.play(audio_np, 24000)
-        sd.wait()

whisperspeech/extract_spk_emb.py (+1 -2)
@@ -5,7 +5,6 @@
 
 # %% ../nbs/2A. Speaker Embeddings.ipynb 3
 import os
-from os.path import expanduser
 import sys
 
 from fastprogress import progress_bar

@@ -49,7 +48,7 @@ def process_shard(
     dl = chunked_dataset(input, bs=batch_size)
 
     classifier = EncoderClassifier.from_hparams("speechbrain/spkrec-ecapa-voxceleb",
-                                                savedir=expanduser("~/.cache/speechbrain/"),
+                                                savedir=f"{os.environ['HOME']}/.cache/speechbrain/",
                                                 run_opts = {"device": device})
 
     with utils.AtomicTarWriter(utils.derived_name(input, f'spk_emb')) as sink:

whisperspeech/extract_stoks.py (+1 -2)
@@ -6,7 +6,6 @@
 # %% ../nbs/3B. Semantic token extraction.ipynb 2
 import sys
 import os
-from os.path import expanduser
 import itertools
 from pathlib import Path
 

@@ -41,7 +40,7 @@ def prepare_stoks(
     # vq_model.encode_mel = torch.compile(vq_model.encode_mel, mode="reduce-overhead", fullgraph=True)
 
     spk_classifier = EncoderClassifier.from_hparams("speechbrain/spkrec-ecapa-voxceleb",
-                                                    savedir=expanduser("~/.cache/speechbrain/"),
+                                                    savedir=f"{os.environ['HOME']}/.cache/speechbrain/",
                                                     run_opts = {"device": device})
 
     total = n_samples//batch_size if n_samples else 'noinfer'

whisperspeech/fetch_models.py (+1 -1)
@@ -4,11 +4,11 @@
 __all__ = []
 
 # %% ../nbs/0. Download models.ipynb 1
+from os.path import expanduser
 from fastcore.script import call_parse
 import whisperx
 import whisper
 from speechbrain.pretrained import EncoderClassifier
-from os.path import expanduser
 
 # %% ../nbs/0. Download models.ipynb 3
 def load_whisperx(model, lang):

whisperspeech/pipeline.py (+1 -10)
@@ -75,7 +75,7 @@ def extract_spk_emb(self, fname):
         if device == 'mps': device = 'cpu' # operator 'aten::_fft_r2c' is not currently implemented for the MPS device
         from speechbrain.pretrained import EncoderClassifier
         self.encoder = EncoderClassifier.from_hparams("speechbrain/spkrec-ecapa-voxceleb",
-                                                      savedir = expanduser("~/.cache/speechbrain/"),
+                                                      savedir=expanduser("~/.cache/speechbrain/"),
                                                       run_opts={"device": device})
         audio_info = torchaudio.info(fname)
         actual_sample_rate = audio_info.sample_rate

@@ -105,12 +105,3 @@ def generate_to_file(self, fname, text, speaker=None, lang='en', cps=15, step_ca
 
     def generate_to_notebook(self, text, speaker=None, lang='en', cps=15, step_callback=None):
         self.vocoder.decode_to_notebook(self.generate_atoks(text, speaker, lang=lang, cps=cps, step_callback=None))
-
-    def generate_to_playback(self, text, speaker=None, lang='en', cps=15, step_callback=None):
-        try:
-            import sounddevice as sd
-        except ImportError:
-            print("\033[93mThe 'sounddevice' library is required for direct text to playback functionality. Please install it using 'pip install sounddevice'.\033[0m")
-            return
-
-        self.vocoder.decode_to_playback(self.generate_atoks(text, speaker, lang=lang, cps=cps, step_callback=step_callback))
