From f4092a8d46bd511230b14ae0d495bcfe59fdbe66 Mon Sep 17 00:00:00 2001
From: Guanlong Zhao
Date: Fri, 6 Sep 2019 16:54:36 -0500
Subject: [PATCH] further cleanup and doc updates.

---
 README.md                        |  61 +++++++++++++++----
 environment.yml                  |   4 +-
 src/common/hparams.py            |  22 ++++---
 src/common/utils.py              |  51 ++++++++++++++++
 src/script/generate_synthesis.py | 100 ++++++++-----------------
 src/script/train_ppg2mel.py      |  19 +++---
 src/script/train_waveglow.py     |  17 ++----
 7 files changed, 155 insertions(+), 119 deletions(-)

diff --git a/README.md b/README.md
index 9217c06..fb53d04 100644
--- a/README.md
+++ b/README.md
@@ -1,41 +1,80 @@
-# Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams (accepted to Interspeech'19)
+# Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams (accepted to Interspeech'19)
 
-**The current version is runnable but you probably won't figure out how, more docs on the way.**
-
-PPG->Speech conversion code. This branch hosts the original code we used to
-prepare our interspeech'19 paper titled "Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams"
+This branch hosts the code we used to
+prepare our Interspeech'19 paper titled "[Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams](https://psi.engr.tamu.edu/wp-content/uploads/2019/07/zhao2019interspeech.pdf)"
 
 ### Install
+This project uses `conda` to manage its dependencies; install [Anaconda](https://anaconda.org/) first if you have not done so.
+
 ```bash
-# Dependencies
+# Clone the repo
+git clone https://github.com/guanlongzhao/fac-via-ppg.git
+cd $PROJECT_ROOT_DIR
+
+# Install dependencies
 conda env create -f environment.yml
 
+# Activate the installed environment
+conda activate ppg-speech
+
+# Compile protocol buffer
 protoc -I=src/common --python_out=src/common src/common/data_utterance.proto
+
+# Include src in your PYTHONPATH
+export PYTHONPATH=$PROJECT_ROOT_DIR/src:$PYTHONPATH
 ```
 
+If `conda` complains that some packages are missing, you can most likely find a similar version of each missing package in Anaconda's package archive.
+
 ### Run unit tests
 ```bash
 cd test
+
+# Remember to make this script executable
 ./run_coverage.sh
 ```
 
-### Train
-Change default parameters in `hparams.py`
+This only runs a few sanity checks; don't worry if the test coverage looks low :)
+
+### Train PPG-to-Mel model
+Change the default parameters in `src/common/hparams.py:create_hparams()`.
+The training and validation data should be specified in text files; see `data/filelists` for examples.
+
 ```bash
 cd src/script
-python train.py
+python train_ppg2mel.py
 ```
 
+The `FP16` mode will not work, unfortunately.
+
+### Train WaveGlow model
+Change the default parameters in `src/waveglow/config.json`. The training data should be specified in the same manner as for the PPG-to-Mel model.
+
+```bash
+cd src/script
+python train_waveglow.py
+```
+
+### View training progress
+Each output dir contains a `log` dir; that is the `LOG_DIR` to use below.
 
-### View progress
 ```bash
 tensorboard --logdir=${LOG_DIR}
 ```
 
+### Generate speech synthesis
+Use `src/script/generate_synthesis.py`; pre-trained models are available in the [Links](#Links) section.
+
+```bash
+generate_synthesis.py [-h] --ppg2mel_model PPG2MEL_MODEL
+                      --waveglow_model WAVEGLOW_MODEL
+                      --teacher_utterance_path TEACHER_UTTERANCE_PATH
+                      --output_dir OUTPUT_DIR
+```
+
 ### Links
-- Syntheses and pretraind models: [link](https://drive.google.com/file/d/1nye-CAGyz3diM5Q80s0iuBYgcIL_cqrs/view?usp=sharing)
+- Syntheses and pre-trained models: [link](https://drive.google.com/file/d/1nye-CAGyz3diM5Q80s0iuBYgcIL_cqrs/view?usp=sharing)
 - Training data (L2-ARCTIC recordings after noise removal): [link](https://drive.google.com/file/d/1WnBHAfjEKdFTBDv5D6DxRnlcvfiODBgy/view?usp=sharing)
 - Demo: [link](https://guanlongzhao.github.io/demo/fac-via-ppg)
diff --git a/environment.yml b/environment.yml
index fb77f98..d2565d5 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: ppg-speech-lite
+name: ppg-speech
 channels:
   - pytorch
   - pykaldi
@@ -100,5 +100,5 @@ dependencies:
   - pip:
     - textgrid==1.4
     - torch==1.0.0
-prefix: /home/guanlong/anaconda2/envs/ppg-speech-lite
+prefix: /home/guanlong/anaconda2/envs/ppg-speech
 
diff --git a/src/common/hparams.py b/src/common/hparams.py
index bb79ab3..4bfd452 100644
--- a/src/common/hparams.py
+++ b/src/common/hparams.py
@@ -55,7 +55,8 @@ def create_hparams(**kwargs):
         "cudnn_enabled": True,
         "cudnn_benchmark": False,
         "output_directory": None,  # Directory to save checkpoints.
-        "log_directory": 'log',  # Directory to save tensorboard logs.
+        # Directory to save tensorboard logs. Just keep it like this.
+        "log_directory": 'log',
         "checkpoint_path": '',  # Path to a checkpoint file.
         "warm_start": False,  # Load the model only (warm start)
         "n_gpus": 1,  # Number of GPUs
@@ -65,8 +66,12 @@ def create_hparams(**kwargs):
         ################################
         # Data Parameters              #
         ################################
-        "training_files": '/home/guanlong/PycharmProjects/fac-via-ppg/data/filelists/ykwk_train_filelist_noise_reduced_lite.txt',
-        "validation_files": '/home/guanlong/PycharmProjects/fac-via-ppg/data/filelists/ykwk_val_filelist_noise_reduced.txt',
+        # Passed as a txt file, see data/filelists/training-set.txt for an
+        # example.
+        "training_files": '',
+        # Passed as a txt file, see data/filelists/validation-set.txt for an
+        # example.
+        "validation_files": '',
         "is_full_ppg": True,  # Whether to use the full PPG or not.
         "is_append_f0": False,  # Currently only effective at sentence level
         "ppg_subsampling_factor": 1,  # Sub-sample the ppg & acoustic sequence.
@@ -76,12 +81,11 @@ def create_hparams(**kwargs):
         # |True              |False          |Please set cache path
         # |False             |True           |Overwrite the cache path
         # |False             |False          |Ignores the cache path
-        "load_feats_from_disk": True,  # Remember to set the path.
+        "load_feats_from_disk": False,  # Remember to set the path.
         # Mutually exclusive with 'load_feats_from_disk', will overwrite
         # 'feats_cache_path' if set.
         "is_cache_feats": False,
-        "feats_cache_path":
-            '/data_repo/arctic/cache/ykwk_feat_cache_noise_reduced_lite.pkl',
+        "feats_cache_path": '',
 
         ################################
         # Audio Parameters             #
@@ -98,7 +102,6 @@
         ################################
         # Model Parameters             #
         ################################
-        # For chain 8629, for fc 5816, for mono 40, for mono+f0 43
         "n_symbols": 5816,
         "symbols_embedding_dim": 600,
 
@@ -156,7 +159,10 @@
 
 
 def create_hparams_stage(**kwargs):
-    """Create model hyperparameters. Parse nondefault from given string."""
+    """Create model hyperparameters. Parse nondefault from given string.
+
+    These are the parameters used for our interspeech 2019 submission.
+    """
 
     hparams = {
         'attention_dim': 150,
diff --git a/src/common/utils.py b/src/common/utils.py
index 80039f6..1fe7b48 100644
--- a/src/common/utils.py
+++ b/src/common/utils.py
@@ -128,3 +128,54 @@ def notch_filtering(wav, fs, w0, Q):
     wav = signal.lfilter(b, a, wav)
 
     return wav
+
+def get_mel(wav, stft):
+    audio = torch.FloatTensor(wav.astype(np.float32))
+    audio_norm = audio / 32768
+    audio_norm = audio_norm.unsqueeze(0)
+    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
+    # (1, n_mel_channels, T)
+    acoustic_feats = stft.mel_spectrogram(audio_norm)
+    return acoustic_feats
+
+
+def waveglow_audio(mel, waveglow, sigma, is_cuda_output=False):
+    mel = torch.autograd.Variable(mel.cuda())
+    if not is_cuda_output:
+        with torch.no_grad():
+            audio = 32768 * waveglow.infer(mel, sigma=sigma)[0]
+        audio = audio.cpu().numpy()
+        audio = audio.astype('int16')
+    else:
+        with torch.no_grad():
+            audio = waveglow.infer(mel, sigma=sigma).cuda()
+    return audio
+
+
+def get_inference(seq, model, is_clip=False):
+    """Tacotron inference.
+
+    Args:
+        seq: T*D numpy array.
+        model: Tacotron model.
+        is_clip: Set to True to avoid the artifacts at the end.
+
+    Returns:
+        synthesized mels.
+    """
+    # (T, D) numpy -> (1, D, T) cpu tensor
+    seq = torch.from_numpy(seq).float().transpose(0, 1).unsqueeze(0)
+    # cpu tensor -> gpu tensor
+    seq = to_gpu(seq)
+    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(seq)
+    if is_clip:
+        return mel_outputs_postnet[:, :, 10:(seq.size(2)-10)]
+    else:
+        return mel_outputs_postnet
+
+
+def load_waveglow_model(path):
+    model = torch.load(path)['model']
+    model = model.remove_weightnorm(model)
+    model.cuda().eval()
+    return model
\ No newline at end of file
diff --git a/src/script/generate_synthesis.py b/src/script/generate_synthesis.py
index f60e8df..07c3810 100644
--- a/src/script/generate_synthesis.py
+++ b/src/script/generate_synthesis.py
@@ -12,86 +12,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from common.data_utils import get_ppg
 from common.hparams import create_hparams_stage
-from script.train_ppg2mel import load_model
-from common.utils import to_gpu
 from common.layers import TacotronSTFT
-from common import feat
+from common.utils import waveglow_audio, get_inference, load_waveglow_model
 from scipy.io import wavfile
-import numpy as np
-import sys
-import torch
-import ppg
-import os
-import logging
-import datetime
-import time
-# sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..',
-#                              'src', 'waveglow'))
+from script.train_ppg2mel import load_model
 from waveglow.denoiser import Denoiser
-from common.data_utils import get_ppg
-
-
-def get_mel(wav, stft):
-    audio = torch.FloatTensor(wav.astype(np.float32))
-    audio_norm = audio / 32768
-    audio_norm = audio_norm.unsqueeze(0)
-    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
-    # (1, n_mel_channels, T)
-    acoustic_feats = stft.mel_spectrogram(audio_norm)
-    return acoustic_feats
-
-
-def waveglow_audio(mel, waveglow, sigma, is_cuda_output=False):
-    mel = torch.autograd.Variable(mel.cuda())
-    if not is_cuda_output:
-        with torch.no_grad():
-            audio = 32768 * waveglow.infer(mel, sigma=sigma)[0]
-        audio = audio.cpu().numpy()
-        audio = audio.astype('int16')
-    else:
-        with torch.no_grad():
-            audio = waveglow.infer(mel, sigma=sigma).cuda()
-    return audio
-
-
-def get_inference(seq, model, is_clip=False):
-    """Tacotron inference.
-
-    Args:
-        seq: T*D numpy array.
-        model: Tacotron model.
-        is_clip: Set to True to avoid the artifacts at the end.
-
-    Returns:
-        synthesized mels.
-    """
-    # (T, D) numpy -> (1, D, T) cpu tensor
-    seq = torch.from_numpy(seq).float().transpose(0, 1).unsqueeze(0)
-    # cpu tensor -> gpu tensor
-    seq = to_gpu(seq)
-    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(seq)
-    if is_clip:
-        return mel_outputs_postnet[:, :, 10:(seq.size(2)-10)]
-    else:
-        return mel_outputs_postnet
-
-
-def load_waveglow_model(path):
-    model = torch.load(path)['model']
-    model = model.remove_weightnorm(model)
-    model.cuda().eval()
-    return model
+import argparse
+import logging
+import os
+import ppg
+import torch
 
 
 if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Generate accent conversion speech using pre-trained '
+                    'models.')
+    parser.add_argument('--ppg2mel_model', type=str, required=True,
+                        help='Path to the PPG-to-Mel model.')
+    parser.add_argument('--waveglow_model', type=str, required=True,
+                        help='Path to the WaveGlow model.')
+    parser.add_argument('--teacher_utterance_path', type=str, required=True,
+                        help='Path to a native speaker recording.')
+    parser.add_argument('--output_dir', type=str, required=True,
+                        help='Output dir, will save the audio and log info.')
+    args = parser.parse_args()
+
     # Prepare dirs
-    timestamp = datetime.datetime.fromtimestamp(time.time())
-    output_dir = \
-        '/media/guanlong/DATA1/exp/ppg-speech/samples/trial_%04d%02d%02d' \
-        '-%02d%02d%02d' \
-        % (timestamp.year, timestamp.month, timestamp.day, timestamp.hour,
-           timestamp.minute, timestamp.second)
+    output_dir = args.output_dir
     if not os.path.isdir(output_dir):
         os.mkdir(output_dir)
     logging.basicConfig(filename=os.path.join(output_dir, 'debug.log'),
@@ -99,9 +49,9 @@ def load_waveglow_model(path):
     logging.info('Output dir: %s', output_dir)
 
     # Parameters
-    checkpoint_path = ''
-    teacher_utt_path = ''
-    waveglow_path = ''
+    teacher_utt_path = args.teacher_utterance_path
+    checkpoint_path = args.ppg2mel_model
+    waveglow_path = args.waveglow_model
     is_clip = False  # Set to True to control the output length of AC.
     fs = 16000
     waveglow_sigma = 0.6
diff --git a/src/script/train_ppg2mel.py b/src/script/train_ppg2mel.py
index 96f354f..366308f 100644
--- a/src/script/train_ppg2mel.py
+++ b/src/script/train_ppg2mel.py
@@ -31,7 +31,6 @@
 
 """Modified from https://github.com/NVIDIA/tacotron2"""
 
-import datetime
 import os
 import time
 import math
@@ -280,19 +279,15 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
 
 if __name__ == '__main__':
     hparams = create_hparams()
 
-    # Prepare paths for this experiment, this helps avoid collisions.
-    timestamp = datetime.datetime.fromtimestamp(time.time())
-    exp_output_root_dir = \
-        '/media/guanlong/DATA1/exp/ppg-speech/tacotron/trial_%04d%02d%02d' \
-        '-%02d%02d%02d' \
-        % (timestamp.year, timestamp.month, timestamp.day, timestamp.hour,
-           timestamp.minute, timestamp.second)
-    os.mkdir(exp_output_root_dir)
-    if hparams.output_directory is None:
-        hparams.output_directory = os.path.join(exp_output_root_dir, 'output')
+    if not hparams.output_directory:
+        raise FileExistsError('Please specify the output dir.')
+    else:
+        if not os.path.exists(hparams.output_directory):
+            os.mkdir(hparams.output_directory)
 
     # Record the hyper-parameters.
-    hparams_snapshot_file = os.path.join(exp_output_root_dir, 'hparams.txt')
+    hparams_snapshot_file = os.path.join(hparams.output_directory,
+                                         'hparams.txt')
     with open(hparams_snapshot_file, 'w') as writer:
         pprint(hparams.__dict__, writer)
diff --git a/src/script/train_waveglow.py b/src/script/train_waveglow.py
index 2323c45..2c164d8 100644
--- a/src/script/train_waveglow.py
+++ b/src/script/train_waveglow.py
@@ -30,8 +30,6 @@
 import json
 import os
 import torch
-import datetime
-import time
 
 from common.logger import WaveglowLogger
 #=====START: ADDED FOR DISTRIBUTED======
@@ -148,6 +146,7 @@ def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
 
             iteration += 1
 
+
 if __name__ == "__main__":
     config_file_path = '../waveglow/config.json'
 
@@ -157,16 +156,12 @@ def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
         config = json.loads(data)
 
     # Prepare paths for this experiment, this helps avoid collisions.
-    timestamp = datetime.datetime.fromtimestamp(time.time())
-    exp_output_root_dir = \
-        '/media/guanlong/DATA1/exp/ppg-speech/waveglow/waveglow_%04d' \
-        '%02d%02d-%02d%02d%02d' % (
-            timestamp.year, timestamp.month, timestamp.day, timestamp.hour,
-            timestamp.minute, timestamp.second)
-    os.mkdir(exp_output_root_dir)
-    config["train_config"]["output_directory"] = exp_output_root_dir
+    if not os.path.exists(config["train_config"]["output_directory"]):
+        os.mkdir(config["train_config"]["output_directory"])
+
     # Stage the parameters.
-    config_snapshot_file = os.path.join(exp_output_root_dir, 'config.json')
+    config_snapshot_file = os.path.join(config["train_config"][
+                                            "output_directory"], 'config.json')
     with open(config_snapshot_file, 'w') as writer:
         json.dump(config, writer)
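
For reference, after applying this patch, an end-to-end run of the new `generate_synthesis.py` CLI might look like the sketch below. This is only an illustration of the flags the patch adds via `argparse`; every path is a placeholder, not a file shipped with the repo or referenced in the patch.

```bash
# Sketch of a typical invocation (all paths are hypothetical placeholders).
# Assumes the ppg-speech conda env is active and src is on PYTHONPATH,
# as described in the README section added above.
cd src/script
python generate_synthesis.py \
    --ppg2mel_model /path/to/ppg2mel/checkpoint_10000 \
    --waveglow_model /path/to/waveglow/waveglow_256channels.pt \
    --teacher_utterance_path /path/to/native_speaker_utterance.wav \
    --output_dir /path/to/synthesis_output
```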