
Commit 3484fe6 ("Initial")

0 parents, 15 files changed, +14293 -0 lines

LICENSE

+21
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Jungil Kong

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

LJSpeech-1.1/training.txt

+12,950
Large diffs are not rendered by default.

LJSpeech-1.1/validation.txt

+150
Large diffs are not rendered by default.

README.md

+85
@@ -0,0 +1,85 @@
# HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis

### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae

In our [paper](https://arxiv.org/abs/2010.05646),
we proposed HiFi-GAN: a GAN-based model capable of generating high-fidelity speech efficiently.<br/>
We provide our implementation and pretrained models as open source in this repository.

**Abstract:**
Several recent studies on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms.
Although such methods improve the sampling efficiency and memory usage,
their sample quality has not yet reached that of autoregressive and flow-based generative models.
In this study, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis.
As speech audio consists of sinusoidal signals with various periods,
we demonstrate that modeling the periodic patterns of audio is crucial for enhancing sample quality.
A subjective human evaluation (mean opinion score, MOS) on a single-speaker dataset indicates that our proposed method
demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than
real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen
speakers and to end-to-end speech synthesis. Finally, a small-footprint version of HiFi-GAN generates samples 13.4 times
faster than real-time on CPU with quality comparable to an autoregressive counterpart.

Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples.


## Pre-requisites
1. Python >= 3.6
2. Clone this repository.
3. Install Python requirements. Please refer to [requirements.txt](requirements.txt).
4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/),
and move all wav files to `LJSpeech-1.1/wavs` (see the setup sketch below).
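
A typical setup sequence is sketched below. The repository URL and the LJSpeech archive name are assumptions about the project's public hosting and the dataset download; adjust them to your environment.
```
git clone https://github.com/jik876/hifi-gan
cd hifi-gan
pip install -r requirements.txt
# Download LJSpeech-1.1 from the dataset page above, then extract it here;
# the extracted archive already places the audio under LJSpeech-1.1/wavs.
tar -xjf LJSpeech-1.1.tar.bz2
```
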
## Training
```
python train.py --config config_v1.json
```
To train the V2 or V3 generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.<br>
Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.<br>
You can change the path by adding the `--checkpoint_path` option.
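For example, to train the V2 generator and keep its checkpoints in a separate directory (the directory name below is only an illustration):
```
python train.py --config config_v2.json --checkpoint_path cp_hifigan_v2
```
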
## Pretrained Model
You can also use the pretrained models we provide.<br/>
[Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)<br/>
Details of each folder are as follows:

|Folder Name|Generator|Dataset|Fine-Tuned|
|------|---|---|---|
|LJ_V1|V1|LJSpeech|No|
|LJ_V2|V2|LJSpeech|No|
|LJ_V3|V3|LJSpeech|No|
|LJ_FT_T2_V1|V1|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
|LJ_FT_T2_V2|V2|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
|LJ_FT_T2_V3|V3|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
|VCTK_V1|V1|VCTK|No|
|VCTK_V2|V2|VCTK|No|
|VCTK_V3|V3|VCTK|No|

## Inference from wav file
1. Make a `test_files` directory and copy wav files into the directory.
2. Run the following command.
```
python inference.py --checkpoint_file [generator checkpoint file path]
```
Generated wav files are saved in `generated_files` by default.<br>
You can change the path by adding the `--output_dir` option.
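If you use one of the pretrained generators listed above, note that `inference.py` reads the model configuration from a `config.json` located in the same directory as the checkpoint file, so keep the downloaded folder together. A hedged example; the checkpoint file name inside the folder is illustrative:
```
python inference.py --checkpoint_file LJ_FT_T2_V2/generator --output_dir generated_files_lj
```
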
## Inference for end-to-end speech synthesis
1. Make a `test_mel_files` directory and copy generated mel-spectrogram files into the directory.<br>
You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2),
[Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth.
2. Run the following command.
```
python inference_e2e.py --checkpoint_file [generator checkpoint file path]
```
Generated wav files are saved in `generated_files_from_mel` by default.<br>
You can change the path by adding the `--output_dir` option.
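If you just want to sanity-check this step without a TTS model, you can write a mel file computed from ground-truth audio using this repository's own `mel_spectrogram`. A minimal sketch, assuming `inference_e2e.py` accepts mel arrays saved with numpy (`.npy`); check that script for the exact format it expects, and note that the input wav path is illustrative:
```
# Sketch (not part of the repo): write a ground-truth mel to test_mel_files as a
# quick pipeline check. Assumes inference_e2e.py reads .npy mel arrays.
import json
import os
import numpy as np
import torch
from env import AttrDict
from meldataset import mel_spectrogram, load_wav, MAX_WAV_VALUE

with open('config_v1.json') as f:
    h = AttrDict(json.load(f))

wav, sr = load_wav('LJSpeech-1.1/wavs/LJ001-0001.wav')  # illustrative input file
wav = torch.FloatTensor(wav / MAX_WAV_VALUE).unsqueeze(0)
mel = mel_spectrogram(wav, h.n_fft, h.num_mels, h.sampling_rate,
                      h.hop_size, h.win_size, h.fmin, h.fmax)

os.makedirs('test_mel_files', exist_ok=True)
np.save('test_mel_files/LJ001-0001.npy', mel.squeeze(0).cpu().numpy())
```
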
## Acknowledgements
We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips)
and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this.

config_v1.json

+37
@@ -0,0 +1,37 @@
{
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8,8,2,2],
    "upsample_kernel_sizes": [16,16,4,4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],

    "segment_size": 8192,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 22050,

    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": null,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}
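
One consistency constraint worth knowing when editing these configs: the generator expands each mel frame into `hop_size` waveform samples, so `upsample_rates` must multiply out to `hop_size` (here 8*8*2*2 = 256; config_v3 uses 8*8*4 = 256). A minimal sanity-check sketch, not part of the repository:
```
# Minimal check (not part of the repo): the product of upsample_rates must
# equal hop_size, because one mel frame becomes hop_size waveform samples.
import json
from functools import reduce
from operator import mul

for name in ('config_v1.json', 'config_v2.json', 'config_v3.json'):
    with open(name) as f:
        h = json.load(f)
    assert reduce(mul, h['upsample_rates']) == h['hop_size'], name
    print(name, 'ok:', h['upsample_rates'], '->', h['hop_size'])
```
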

config_v2.json

+37
@@ -0,0 +1,37 @@
{
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8,8,2,2],
    "upsample_kernel_sizes": [16,16,4,4],
    "upsample_initial_channel": 128,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],

    "segment_size": 8192,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 22050,

    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": null,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}

config_v3.json

+37
@@ -0,0 +1,37 @@
{
    "resblock": "2",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8,8,4],
    "upsample_kernel_sizes": [16,16,8],
    "upsample_initial_channel": 256,
    "resblock_kernel_sizes": [3,5,7],
    "resblock_dilation_sizes": [[1,2], [2,6], [3,12]],

    "segment_size": 8192,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 22050,

    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": null,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}

env.py

+15
@@ -0,0 +1,15 @@
import os
import shutil


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def build_env(config, config_name, path):
    t_path = os.path.join(path, config_name)
    if config != t_path:
        os.makedirs(path, exist_ok=True)
        shutil.copyfile(config, os.path.join(path, config_name))
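
A brief usage sketch, not part of the repository: `AttrDict` lets the rest of the code read config values as attributes, and `build_env` copies the chosen config into the checkpoint directory, which is how a `config.json` ends up next to the checkpoints for `inference.py` to find. File and directory names below are illustrative.
```
# Usage sketch (illustrative paths): read a config as attributes and copy it
# into the checkpoint directory as config.json.
import json
from env import AttrDict, build_env

with open('config_v1.json') as f:
    h = AttrDict(json.load(f))

print(h.sampling_rate, h.hop_size)  # attribute access instead of h['sampling_rate']
build_env('config_v1.json', 'config.json', 'cp_hifigan')  # creates cp_hifigan/config.json
```
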

inference.py

+95
@@ -0,0 +1,95 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import glob
import os
import argparse
import json
import torch
from scipy.io.wavfile import write
from env import AttrDict
from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
from models import Generator

h = None
device = None


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict


def get_mel(x):
    return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)


def scan_checkpoint(cp_dir, prefix):
    pattern = os.path.join(cp_dir, prefix + '*')
    cp_list = glob.glob(pattern)
    if len(cp_list) == 0:
        return ''
    return sorted(cp_list)[-1]


def inference(a):
    generator = Generator(h).to(device)

    state_dict_g = load_checkpoint(a.checkpoint_file, device)
    generator.load_state_dict(state_dict_g['generator'])

    filelist = os.listdir(a.input_wavs_dir)

    os.makedirs(a.output_dir, exist_ok=True)

    generator.eval()
    generator.remove_weight_norm()
    with torch.no_grad():
        for i, filname in enumerate(filelist):
            wav, sr = load_wav(os.path.join(a.input_wavs_dir, filname))
            wav = wav / MAX_WAV_VALUE
            wav = torch.FloatTensor(wav).to(device)
            x = get_mel(wav.unsqueeze(0))
            y_g_hat = generator(x)
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            audio = audio.cpu().numpy().astype('int16')

            output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated.wav')
            write(output_file, h.sampling_rate, audio)
            print(output_file)


def main():
    print('Initializing Inference Process..')

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_wavs_dir', default='test_files')
    parser.add_argument('--output_dir', default='generated_files')
    parser.add_argument('--checkpoint_file', required=True)
    a = parser.parse_args()

    config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json')
    with open(config_file) as f:
        data = f.read()

    global h
    json_config = json.loads(data)
    h = AttrDict(json_config)

    torch.manual_seed(h.seed)
    global device
    if torch.cuda.is_available():
        torch.cuda.manual_seed(h.seed)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    inference(a)


if __name__ == '__main__':
    main()
