-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathmelspectrograms.py
82 lines (62 loc) · 2.97 KB
/
melspectrograms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from argparse import ArgumentParser
from essentia.standard import *
import essentia
import numpy
def load_audio(filename, sampleRate=12000, segment_duration=None):
    """Load an audio file with Essentia's MonoLoader, optionally keeping a centered excerpt.

    Args:
        filename: Path of the audio file, forwarded to ``MonoLoader``.
        sampleRate: Sample rate requested from the loader; also used to
            convert ``segment_duration`` from seconds to samples.
        segment_duration: Length in seconds of the excerpt to cut from the
            middle of the signal. ``None`` (or any other falsy value, such
            as ``0``) keeps the whole signal.

    Returns:
        The loaded samples, sliced to the centered excerpt when a segment
        duration was requested.

    Raises:
        ValueError: If ``segment_duration`` is negative, or longer than the
            loaded audio.
    """
    audio = MonoLoader(filename=filename, sampleRate=sampleRate, resampleQuality=4)()
    total_samples = len(audio)

    if segment_duration:
        if segment_duration < 0:
            # A negative duration gives start > end, which slips through the
            # bounds check below and would silently return an empty slice.
            raise ValueError('Segment duration must not be negative')
        segment_samples = round(segment_duration * sampleRate)
        # Center the excerpt: equal amounts of audio are dropped on both sides.
        segment_start = (total_samples - segment_samples) // 2
        segment_end = segment_start + segment_samples
    else:
        segment_start = 0
        segment_end = total_samples

    if segment_start < 0 or segment_end > total_samples:
        raise ValueError('Segment duration is larger than the input audio duration')

    return audio[segment_start:segment_end]
def melspectrogram(audio,
                   sampleRate=12000, frameSize=512, hopSize=256,
                   window='hann', zeroPadding=0, center=True,
                   numberBands=96, lowFrequencyBound=0, highFrequencyBound=None,
                   weighting='linear', warpingFormula='slaneyMel',
                   normalize='unit_tri'):
    """Compute a dB-scaled mel-spectrogram of ``audio`` frame by frame.

    Each frame is windowed, turned into a spectrum, reduced to power mel
    bands and converted with the ``lin2db`` unary operator.

    Args:
        audio: Input samples, consumed by Essentia's ``FrameGenerator``.
        sampleRate: Sample rate of ``audio``, passed to ``MelBands``.
        frameSize: Frame length in samples.
        hopSize: Hop between consecutive frames in samples.
        window: Window type passed to ``Windowing``.
        zeroPadding: Zero-padding size passed to ``Windowing``; it also
            enlarges the spectrum size expected by ``MelBands``.
        center: When true, frames are not started from zero
            (``startFromZero=not center``).
        numberBands: Number of mel bands.
        lowFrequencyBound: Lower frequency bound of the mel filterbank.
        highFrequencyBound: Upper frequency bound of the mel filterbank;
            defaults to ``sampleRate/2`` when ``None``.
        weighting: ``weighting`` parameter of ``MelBands``.
        warpingFormula: ``warpingFormula`` parameter of ``MelBands``.
        normalize: ``normalize`` parameter of ``MelBands``.

    Returns:
        The transposed ``'mel'`` descriptor collected in the pool
        (presumably bands along the first axis, frames along the second).
    """
    upper_bound = sampleRate/2 if highFrequencyBound is None else highFrequencyBound
    # Number of bins in the spectrum of one (zero-padded) frame.
    spectrum_bins = (frameSize + zeroPadding)//2 + 1

    apply_window = Windowing(type=window, normalized=False, zeroPadding=zeroPadding)
    to_spectrum = Spectrum()
    to_mel = MelBands(type='power',
                      inputSize=spectrum_bins,
                      numberBands=numberBands,
                      sampleRate=sampleRate,
                      lowFrequencyBound=lowFrequencyBound,
                      highFrequencyBound=upper_bound,
                      weighting=weighting,
                      warpingFormula=warpingFormula,
                      normalize=normalize)
    to_db = UnaryOperator(type='lin2db', scale=2)

    results = essentia.Pool()
    frames = FrameGenerator(audio,
                            frameSize=frameSize,
                            hopSize=hopSize,
                            startFromZero=not center)
    for frame in frames:
        windowed = apply_window(frame)
        band_energies = to_mel(to_spectrum(windowed))
        results.add('mel', to_db(band_energies))

    return results['mel'].T
def analyze(audio_file, npy_file, full_audio):
    """Compute the mel-spectrogram of an audio file and save it as an NPY file.

    Args:
        audio_file: Path of the input audio file.
        npy_file: Destination passed to ``numpy.save`` (NumPy appends
            ``.npy`` to a file name that lacks that extension).
        full_audio: When true, analyze the whole file; otherwise analyze
            only a centered 29.1-second segment.
    """
    # 29.1 s is the duration for the Choi's VGG model; None means the full
    # audio duration is analyzed.
    segment_duration = None if full_audio else 29.1

    mel = melspectrogram(load_audio(audio_file, segment_duration=segment_duration))
    numpy.save(npy_file, mel, allow_pickle=False)
if __name__ == '__main__':
    # Command-line entry point: parse the input/output paths and the --full
    # switch, then run the analysis.
    cli = ArgumentParser(
        description="Computes a mel-spectrogram for an audio file. "
                    "Results are stored to a NumPy array binary file.")
    cli.add_argument('audio_file', help='input audio file')
    cli.add_argument('npy_file', help='output NPY file to store mel-spectrogram')
    cli.add_argument('--full', dest='full_audio', action='store_true',
                     help='analyze full audio instead of a centered 29.1s segment')

    options = cli.parse_args()
    analyze(options.audio_file, options.npy_file, options.full_audio)