# User-tunable knobs; override from the command line or an including Makefile.
LANGUAGE ?= en
TRANSCRIPTION_MODEL ?= medium
STOKS_MODEL ?= vq-sarah_teal.model
# Basename without the .model suffix (e.g. "vq-sarah_teal"), used in dir names.
# := so the substitution is computed once, not on every expansion.
STOKS_MODEL_SHORT := $(patsubst %.model,%,$(notdir $(STOKS_MODEL)))

# Disable legacy suffix rules; everything here is explicit or pattern rules.
.SUFFIXES:
# Keep intermediate .tar.gz shards; .PRECIOUS (unlike .SECONDARY) accepts a
# pattern. NOTE: it also preserves a partially-written shard on interrupt.
.PRECIOUS: %.tar.gz
# Delete any other target whose recipe fails so a truncated file never looks
# up to date (.PRECIOUS targets are exempt from this).
.DELETE_ON_ERROR:

# main goal: build every derived dataset plus the sample lists below
dataset:
# dataset/dirs are command names, not files — never treat them as up to date
.PHONY: dataset dirs

# a list of all the derived datasets we can generate
DATASETS := snr-c50 stoks-max-$(STOKS_MODEL_SHORT) encodec-3kbps $(TRANSCRIPTION_MODEL)-txt mvad spk_emb vad

# all source shards ($D — the dataset root — is supplied by the including Makefile)
SRC_FILES := $(notdir $(wildcard $D/audio/*.tar))

# derived: map every source shard name onto its compressed shard under $D/$1
# (recursive "=" on purpose — $1 is only bound at $(call ...) time)
derived = $(patsubst %,$D/$1/%.gz,$(SRC_FILES))

# create common targets for all the datasets:
#   all_<ds> (phony) — builds every derived shard of dataset <ds>
#   $D/<ds>          — creates the output directory
# and hooks them into the aggregate `dataset` and `dirs` goals.
# $D and $(1) are expanded at $(eval) time, baking the current dataset root
# into each generated rule (see the note on target-specific variables below).
define dataset_template
all_$(1): $(call derived,$(1))
.PHONY: all_$(1)

$D/$(1):
	mkdir -p $D/$(1)

dataset: all_$(1)
dirs: $D/$(1)
endef
# instantiate the template once per dataset
$(foreach ds,$(DATASETS),$(eval $(call dataset_template,$(ds))))

# Voice-activity detection shard for each audio shard.
# The order-only `| $D/vad` prerequisite only ensures the directory exists.
$D/vad/%.tar.gz: $D/audio/%.tar | $D/vad
	python -m whisperspeech.vad '$<' '$@'

# Master list of all sample ids, extracted from the VAD shard contents.
# Dots are escaped so grep/sed match the literal ".vad.npy" suffix, and the
# sed expression is quoted and anchored ($$ passes a single $ to the shell).
$D/all-files: $(call derived,vad)
	parallel -j16 "tar tf {} | grep '\.vad\.npy$$' | sed -e 's/\.vad\.npy$$//'" ::: $^ > "$@"
dataset: $D/all-files
all_all-files: $D/all-files
# match the .PHONY declaration the template generates for its all_* targets
.PHONY: all_all-files

# Speaker embeddings for each sample.
# NOTE(review): the VAD shard is a prerequisite but is not passed on the
# command line — presumably whisperspeech.extract_spk_emb derives its path
# from '$<'; confirm against the tool.
$D/spk_emb/%.tar.gz: $D/audio/%.tar $D/vad/%.tar.gz | $D/spk_emb
	python -m whisperspeech.extract_spk_emb --batch_size 16 '$<' '$@'

# Merged VAD shards, combining raw VAD segments with speaker embeddings.
# NOTE(review): only the VAD shard '$<' is passed explicitly — presumably
# whisperspeech.vad_merge locates the spk_emb shard from its path; confirm.
$D/mvad/%: $D/vad/% $D/spk_emb/% | $D/mvad
	python -m whisperspeech.vad_merge '$<' '$@'

# The values of these target-specific variables will be saved for each dataset.
# This allows us to include multiple datasets (with different $D's) in a single global Makefile
# and make sure the variables will be properly substituted in the command lists.
$D/$(TRANSCRIPTION_MODEL)-txt/%.tar.gz: TRANSCRIPTION_MODEL:=$(TRANSCRIPTION_MODEL)
$D/$(TRANSCRIPTION_MODEL)-txt/%.tar.gz: LANGUAGE:=$(LANGUAGE)
# Transcriptions: run the transcription model over each audio shard, guided by mvad.
$D/$(TRANSCRIPTION_MODEL)-txt/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz | $D/$(TRANSCRIPTION_MODEL)-txt
	python -m whisperspeech.prepare_t2s_txts --language="$(LANGUAGE)" --transcription_model="$(TRANSCRIPTION_MODEL)" "$<" "$@"

# EnCodec acoustic tokens at 3 kbps.
# NOTE(review): JOBS_PER_GPU/TIME_LIMIT are plain environment variables —
# presumably read by a cluster/job-launch wrapper, not by make; confirm.
$D/encodec-3kbps/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz | $D/encodec-3kbps
	JOBS_PER_GPU=3 TIME_LIMIT=30:00 python -m whisperspeech.prepare_s2a_atoks --batch_size=4 "$<" "$@"

# Semantic tokens extracted with the VQ model.
# STOKS_MODEL is saved as a target-specific variable for the same multi-dataset
# reason as TRANSCRIPTION_MODEL/LANGUAGE above.
$D/stoks-max-$(STOKS_MODEL_SHORT)/%.tar.gz: STOKS_MODEL := $(STOKS_MODEL)
$D/stoks-max-$(STOKS_MODEL_SHORT)/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz | $D/stoks-max-$(STOKS_MODEL_SHORT)
	JOBS_PER_GPU=2 TIME_LIMIT=30:00 python -m whisperspeech.extract_stoks --vq_model "$(STOKS_MODEL)" --batch_size=8 "$<" "$@"

# SNR / C50 acoustic quality metrics.
# The order-only `| $D/snr-c50` prerequisite was missing here (every other
# dataset rule has one); without it the recipe can run before the output
# directory exists, failing under `make -j`.
$D/snr-c50/%.tar.gz: $D/audio/%.tar $D/mvad/%.tar.gz | $D/snr-c50
	JOBS_PER_GPU=2 TIME_LIMIT=30:00 python -m whisperspeech.extract_metrics "$<" "$@"

# We don't need to make $TRANSCRIPTION_MODEL target-specific here since it will be baked into
# the rule prereqs and later we only use the result via the target-specific $^ variable.
# Same logic applies to $D (and its use in $@).
$D/txt-samples.list: $(call derived,$(TRANSCRIPTION_MODEL)-txt)
	parallel tar tf {} ::: $^ | sed -e 's/\.txt//' > "$@"
dataset: $D/txt-samples.list

# List of all samples that have acoustic tokens.
# The spurious order-only `| $D/encodec-3kbps` dependency was dropped: the
# shard prerequisites already guarantee that directory is populated, and the
# txt-samples.list rule above carries no such dependency either.
$D/atoks-samples.list: $(call derived,encodec-3kbps)
	parallel tar tf {} ::: $^ | sed -e 's/\.atoks\.npy//' > "$@"
dataset: $D/atoks-samples.list

# Record the dataset language in a marker file (LANGUAGE is saved as a
# target-specific variable for the multi-dataset case described above).
$D/language: LANGUAGE:=$(LANGUAGE)
$D/language:
	printf "%s" "$(LANGUAGE)" > "$@"
dataset: $D/language
all_language: $D/language
# match the .PHONY declaration the template generates for its all_* targets
.PHONY: all_language