From f4092a8d46bd511230b14ae0d495bcfe59fdbe66 Mon Sep 17 00:00:00 2001
From: Guanlong Zhao
Date: Fri, 6 Sep 2019 16:54:36 -0500
Subject: [PATCH] further cleanup and doc updates.

---
 README.md                        |  61 +++++++++++++++----
 environment.yml                  |   4 +-
 src/common/hparams.py            |  22 ++++---
 src/common/utils.py              |  51 ++++++++++++++++
 src/script/generate_synthesis.py | 100 ++++++++-----------------
 src/script/train_ppg2mel.py      |  19 +++---
 src/script/train_waveglow.py     |  17 ++----
 7 files changed, 155 insertions(+), 119 deletions(-)

diff --git a/README.md b/README.md
index 9217c06..fb53d04 100644
--- a/README.md
+++ b/README.md
@@ -1,41 +1,80 @@
-# Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams (accepted to Interspeech'19)
+# Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams (accepted to Interspeech'19)
 
-**The current version is runnable but you probably won't figure out how, more docs on the way.**
-
-PPG->Speech conversion code. This branch hosts the original code we used to
-prepare our interspeech'19 paper titled "Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams"
+This branch hosts the code we used to
+prepare our Interspeech'19 paper titled "[Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams](https://psi.engr.tamu.edu/wp-content/uploads/2019/07/zhao2019interspeech.pdf)"
 
 ### Install
+This project uses `conda` to manage its dependencies; install [Anaconda](https://anaconda.org/) first if you have not done so.
+
 ```bash
-# Dependencies
+# Clone the repo
+git clone https://github.com/guanlongzhao/fac-via-ppg.git
+cd $PROJECT_ROOT_DIR
+
+# Install dependencies
 conda env create -f environment.yml
 
+# Activate the installed environment
+conda activate ppg-speech
+
+# Compile protocol buffer
 protoc -I=src/common --python_out=src/common src/common/data_utterance.proto
+
+# Include src in your PYTHONPATH
+export PYTHONPATH=$PROJECT_ROOT_DIR/src:$PYTHONPATH
 ```
 
+If `conda` complains that some packages are missing, you can most likely find a similar version of each missing package in Anaconda's package archive.
+
 ### Run unit tests
 ```bash
 cd test
+
+# Remember to make this script executable
 ./run_coverage.sh
 ```
 
-### Train
-Change default parameters in `hparams.py`
+This only runs a few sanity checks; don't worry if the test coverage looks low :)
+
+### Train PPG-to-Mel model
+Change the default parameters in `src/common/hparams.py:create_hparams()`.
+The training and validation data should be specified in text files; see `data/filelists` for examples.
+
 ```bash
 cd src/script
-python train.py
+python train_ppg2mel.py
 ```
 
+The `FP16` mode will not work, unfortunately.
+
+### Train WaveGlow model
+Change the default parameters in `src/waveglow/config.json`. The training data should be specified in the same manner as for the PPG-to-Mel model.
+
+```bash
+cd src/script
+python train_waveglow.py
+```
+
+### View training progress
+Each output dir contains a `log` dir; that is the `LOG_DIR` to use below.
 
-### View progress
 ```bash
 tensorboard --logdir=${LOG_DIR}
 ```
 
+### Generate speech synthesis
+Use `src/script/generate_synthesis.py`; pre-trained models are available in the [Links](#Links) section.
+
+```bash
+generate_synthesis.py [-h] --ppg2mel_model PPG2MEL_MODEL
+                      --waveglow_model WAVEGLOW_MODEL
+                      --teacher_utterance_path TEACHER_UTTERANCE_PATH
+                      --output_dir OUTPUT_DIR
+```
+
 ### Links
-- Syntheses and pretraind models: [link](https://drive.google.com/file/d/1nye-CAGyz3diM5Q80s0iuBYgcIL_cqrs/view?usp=sharing)
+- Syntheses and pre-trained models: [link](https://drive.google.com/file/d/1nye-CAGyz3diM5Q80s0iuBYgcIL_cqrs/view?usp=sharing)
 - Training data (L2-ARCTIC recordings after noise removal): [link](https://drive.google.com/file/d/1WnBHAfjEKdFTBDv5D6DxRnlcvfiODBgy/view?usp=sharing)
 - Demo: [link](https://guanlongzhao.github.io/demo/fac-via-ppg)
diff --git a/environment.yml b/environment.yml
index fb77f98..d2565d5 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: ppg-speech-lite
+name: ppg-speech
 channels:
   - pytorch
   - pykaldi
@@ -100,5 +100,5 @@ dependencies:
   - pip:
     - textgrid==1.4
     - torch==1.0.0
-prefix: /home/guanlong/anaconda2/envs/ppg-speech-lite
+prefix: /home/guanlong/anaconda2/envs/ppg-speech
 
diff --git a/src/common/hparams.py b/src/common/hparams.py
index bb79ab3..4bfd452 100644
--- a/src/common/hparams.py
+++ b/src/common/hparams.py
@@ -55,7 +55,8 @@ def create_hparams(**kwargs):
         "cudnn_enabled": True,
         "cudnn_benchmark": False,
         "output_directory": None,  # Directory to save checkpoints.
-        "log_directory": 'log',  # Directory to save tensorboard logs.
+        # Directory to save tensorboard logs. Just keep it like this.
+        "log_directory": 'log',
         "checkpoint_path": '',  # Path to a checkpoint file.
         "warm_start": False,  # Load the model only (warm start)
         "n_gpus": 1,  # Number of GPUs
@@ -65,8 +66,12 @@ def create_hparams(**kwargs):
         ################################
         # Data Parameters              #
         ################################
-        "training_files": '/home/guanlong/PycharmProjects/fac-via-ppg/data/filelists/ykwk_train_filelist_noise_reduced_lite.txt',
-        "validation_files": '/home/guanlong/PycharmProjects/fac-via-ppg/data/filelists/ykwk_val_filelist_noise_reduced.txt',
+        # Passed as a txt file, see data/filelists/training-set.txt for an
+        # example.
+        "training_files": '',
+        # Passed as a txt file, see data/filelists/validation-set.txt for an
+        # example.
+        "validation_files": '',
         "is_full_ppg": True,  # Whether to use the full PPG or not.
         "is_append_f0": False,  # Currently only effective at sentence level
         "ppg_subsampling_factor": 1,  # Sub-sample the ppg & acoustic sequence.
@@ -76,12 +81,11 @@ def create_hparams(**kwargs):
         # |True              |False          |Please set cache path
         # |False             |True           |Overwrite the cache path
         # |False             |False          |Ignores the cache path
-        "load_feats_from_disk": True,  # Remember to set the path.
+        "load_feats_from_disk": False,  # Remember to set the path.
         # Mutually exclusive with 'load_feats_from_disk', will overwrite
         # 'feats_cache_path' if set.
         "is_cache_feats": False,
-        "feats_cache_path":
-            '/data_repo/arctic/cache/ykwk_feat_cache_noise_reduced_lite.pkl',
+        "feats_cache_path": '',
 
         ################################
         # Audio Parameters             #
@@ -98,7 +102,6 @@
         ################################
         # Model Parameters             #
         ################################
-        # For chain 8629, for fc 5816, for mono 40, for mono+f0 43
         "n_symbols": 5816,
         "symbols_embedding_dim": 600,
 
@@ -156,7 +159,10 @@
 
 
 def create_hparams_stage(**kwargs):
-    """Create model hyperparameters. Parse nondefault from given string."""
+    """Create model hyperparameters. Parse nondefault from given string.
+
+    These are the parameters used for our interspeech 2019 submission.
+    """
 
     hparams = {
         'attention_dim': 150,
diff --git a/src/common/utils.py b/src/common/utils.py
index 80039f6..1fe7b48 100644
--- a/src/common/utils.py
+++ b/src/common/utils.py
@@ -128,3 +128,54 @@ def notch_filtering(wav, fs, w0, Q):
     wav = signal.lfilter(b, a, wav)
 
     return wav
+
+def get_mel(wav, stft):
+    audio = torch.FloatTensor(wav.astype(np.float32))
+    audio_norm = audio / 32768
+    audio_norm = audio_norm.unsqueeze(0)
+    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
+    # (1, n_mel_channels, T)
+    acoustic_feats = stft.mel_spectrogram(audio_norm)
+    return acoustic_feats
+
+
+def waveglow_audio(mel, waveglow, sigma, is_cuda_output=False):
+    mel = torch.autograd.Variable(mel.cuda())
+    if not is_cuda_output:
+        with torch.no_grad():
+            audio = 32768 * waveglow.infer(mel, sigma=sigma)[0]
+        audio = audio.cpu().numpy()
+        audio = audio.astype('int16')
+    else:
+        with torch.no_grad():
+            audio = waveglow.infer(mel, sigma=sigma).cuda()
+    return audio
+
+
+def get_inference(seq, model, is_clip=False):
+    """Tacotron inference.
+
+    Args:
+        seq: T*D numpy array.
+        model: Tacotron model.
+        is_clip: Set to True to avoid the artifacts at the end.
+
+    Returns:
+        synthesized mels.
+    """
+    # (T, D) numpy -> (1, D, T) cpu tensor
+    seq = torch.from_numpy(seq).float().transpose(0, 1).unsqueeze(0)
+    # cpu tensor -> gpu tensor
+    seq = to_gpu(seq)
+    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(seq)
+    if is_clip:
+        return mel_outputs_postnet[:, :, 10:(seq.size(2)-10)]
+    else:
+        return mel_outputs_postnet
+
+
+def load_waveglow_model(path):
+    model = torch.load(path)['model']
+    model = model.remove_weightnorm(model)
+    model.cuda().eval()
+    return model
\ No newline at end of file
diff --git a/src/script/generate_synthesis.py b/src/script/generate_synthesis.py
index f60e8df..07c3810 100644
--- a/src/script/generate_synthesis.py
+++ b/src/script/generate_synthesis.py
@@ -12,86 +12,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from common.data_utils import get_ppg
 from common.hparams import create_hparams_stage
-from script.train_ppg2mel import load_model
-from common.utils import to_gpu
 from common.layers import TacotronSTFT
-from common import feat
+from common.utils import waveglow_audio, get_inference, load_waveglow_model
 from scipy.io import wavfile
-import numpy as np
-import sys
-import torch
-import ppg
-import os
-import logging
-import datetime
-import time
-# sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..',
-#                              'src', 'waveglow'))
+from script.train_ppg2mel import load_model
 from waveglow.denoiser import Denoiser
-from common.data_utils import get_ppg
-
-
-def get_mel(wav, stft):
-    audio = torch.FloatTensor(wav.astype(np.float32))
-    audio_norm = audio / 32768
-    audio_norm = audio_norm.unsqueeze(0)
-    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
-    # (1, n_mel_channels, T)
-    acoustic_feats = stft.mel_spectrogram(audio_norm)
-    return acoustic_feats
-
-
-def waveglow_audio(mel, waveglow, sigma, is_cuda_output=False):
-    mel = torch.autograd.Variable(mel.cuda())
-    if not is_cuda_output:
-        with torch.no_grad():
-            audio = 32768 * waveglow.infer(mel, sigma=sigma)[0]
-        audio = audio.cpu().numpy()
-        audio = audio.astype('int16')
-    else:
-        with torch.no_grad():
-            audio = waveglow.infer(mel, sigma=sigma).cuda()
-    return audio
-
-
-def get_inference(seq, model, is_clip=False):
-    """Tacotron inference.
-
-    Args:
-        seq: T*D numpy array.
-        model: Tacotron model.
-        is_clip: Set to True to avoid the artifacts at the end.
-
-    Returns:
-        synthesized mels.
-    """
-    # (T, D) numpy -> (1, D, T) cpu tensor
-    seq = torch.from_numpy(seq).float().transpose(0, 1).unsqueeze(0)
-    # cpu tensor -> gpu tensor
-    seq = to_gpu(seq)
-    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(seq)
-    if is_clip:
-        return mel_outputs_postnet[:, :, 10:(seq.size(2)-10)]
-    else:
-        return mel_outputs_postnet
-
-
-def load_waveglow_model(path):
-    model = torch.load(path)['model']
-    model = model.remove_weightnorm(model)
-    model.cuda().eval()
-    return model
+import argparse
+import logging
+import os
+import ppg
+import torch
 
 
 if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Generate accent conversion speech using pre-trained '
+                    'models.')
+    parser.add_argument('--ppg2mel_model', type=str, required=True,
+                        help='Path to the PPG-to-Mel model.')
+    parser.add_argument('--waveglow_model', type=str, required=True,
+                        help='Path to the WaveGlow model.')
+    parser.add_argument('--teacher_utterance_path', type=str, required=True,
+                        help='Path to a native speaker recording.')
+    parser.add_argument('--output_dir', type=str, required=True,
+                        help='Output dir, will save the audio and log info.')
+    args = parser.parse_args()
+
     # Prepare dirs
-    timestamp = datetime.datetime.fromtimestamp(time.time())
-    output_dir = \
-        '/media/guanlong/DATA1/exp/ppg-speech/samples/trial_%04d%02d%02d' \
-        '-%02d%02d%02d' \
-        % (timestamp.year, timestamp.month, timestamp.day, timestamp.hour,
-           timestamp.minute, timestamp.second)
+    output_dir = args.output_dir
     if not os.path.isdir(output_dir):
         os.mkdir(output_dir)
     logging.basicConfig(filename=os.path.join(output_dir, 'debug.log'),
@@ -99,9 +49,9 @@ def load_waveglow_model(path):
     logging.info('Output dir: %s', output_dir)
 
     # Parameters
-    checkpoint_path = ''
-    teacher_utt_path = ''
-    waveglow_path = ''
+    teacher_utt_path = args.teacher_utterance_path
+    checkpoint_path = args.ppg2mel_model
+    waveglow_path = args.waveglow_model
     is_clip = False  # Set to True to control the output length of AC.
     fs = 16000
     waveglow_sigma = 0.6
diff --git a/src/script/train_ppg2mel.py b/src/script/train_ppg2mel.py
index 96f354f..366308f 100644
--- a/src/script/train_ppg2mel.py
+++ b/src/script/train_ppg2mel.py
@@ -31,7 +31,6 @@
 
 """Modified from https://github.com/NVIDIA/tacotron2"""
 
-import datetime
 import os
 import time
 import math
@@ -280,19 +279,15 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
 
 if __name__ == '__main__':
     hparams = create_hparams()
 
-    # Prepare paths for this experiment, this helps avoid collisions.
-    timestamp = datetime.datetime.fromtimestamp(time.time())
-    exp_output_root_dir = \
-        '/media/guanlong/DATA1/exp/ppg-speech/tacotron/trial_%04d%02d%02d' \
-        '-%02d%02d%02d' \
-        % (timestamp.year, timestamp.month, timestamp.day, timestamp.hour,
-           timestamp.minute, timestamp.second)
-    os.mkdir(exp_output_root_dir)
-    if hparams.output_directory is None:
-        hparams.output_directory = os.path.join(exp_output_root_dir, 'output')
+    if not hparams.output_directory:
+        raise FileExistsError('Please specify the output dir.')
+    else:
+        if not os.path.exists(hparams.output_directory):
+            os.mkdir(hparams.output_directory)
 
     # Record the hyper-parameters.
-    hparams_snapshot_file = os.path.join(exp_output_root_dir, 'hparams.txt')
+    hparams_snapshot_file = os.path.join(hparams.output_directory,
+                                         'hparams.txt')
     with open(hparams_snapshot_file, 'w') as writer:
         pprint(hparams.__dict__, writer)
diff --git a/src/script/train_waveglow.py b/src/script/train_waveglow.py
index 2323c45..2c164d8 100644
--- a/src/script/train_waveglow.py
+++ b/src/script/train_waveglow.py
@@ -30,8 +30,6 @@
 import json
 import os
 import torch
-import datetime
-import time
 
 from common.logger import WaveglowLogger
 #=====START: ADDED FOR DISTRIBUTED======
@@ -148,6 +146,7 @@ def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
 
             iteration += 1
 
+
 if __name__ == "__main__":
     config_file_path = '../waveglow/config.json'
 
@@ -157,16 +156,12 @@ def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
         config = json.loads(data)
 
     # Prepare paths for this experiment, this helps avoid collisions.
-    timestamp = datetime.datetime.fromtimestamp(time.time())
-    exp_output_root_dir = \
-        '/media/guanlong/DATA1/exp/ppg-speech/waveglow/waveglow_%04d' \
-        '%02d%02d-%02d%02d%02d' % (
-            timestamp.year, timestamp.month, timestamp.day, timestamp.hour,
-            timestamp.minute, timestamp.second)
-    os.mkdir(exp_output_root_dir)
-    config["train_config"]["output_directory"] = exp_output_root_dir
+    if not os.path.exists(config["train_config"]["output_directory"]):
+        os.mkdir(config["train_config"]["output_directory"])
+
     # Stage the parameters.
-    config_snapshot_file = os.path.join(exp_output_root_dir, 'config.json')
+    config_snapshot_file = os.path.join(config["train_config"][
+                                            "output_directory"], 'config.json')
     with open(config_snapshot_file, 'w') as writer:
         json.dump(config, writer)
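
For reference, after applying this patch, an end-to-end run of the new `generate_synthesis.py` CLI might look like the sketch below. This is only an illustration of the flags the patch adds via `argparse`; every path is a placeholder, not a file shipped with the repo or referenced in the patch.

```bash
# Sketch of a typical invocation (all paths are hypothetical placeholders).
# Assumes the ppg-speech conda env is active and src is on PYTHONPATH,
# as described in the README section added above.
cd src/script
python generate_synthesis.py \
    --ppg2mel_model /path/to/ppg2mel/checkpoint_10000 \
    --waveglow_model /path/to/waveglow/waveglow_256channels.pt \
    --teacher_utterance_path /path/to/native_speaker_utterance.wav \
    --output_dir /path/to/synthesis_output
```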