diff --git a/README.md b/README.md
index f7883f8c..b1c7890c 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@ Visit our [demo](https://jaywalnut310.github.io/vits-demo/index.html) for audio
 
 We also provide the [pretrained models](https://drive.google.com/drive/folders/1ksarh-cJf3F5eKJjLVWY0X1j1qsQqiS2?usp=sharing).
 
+** Update note: [Gradio Web Demo](https://gradio.app/hub/AK391/vits)
+
 ** Update note: Thanks to [Rishikesh (ऋषिकेश)](https://github.com/jaywalnut310/vits/issues/1), our interactive TTS demo is now available on [Colab Notebook](https://colab.research.google.com/drive/1CO61pZizDj7en71NQG_aqqKdGaA_SaBf?usp=sharing).
diff --git a/gradiodemo.py b/gradiodemo.py
new file mode 100644
index 00000000..681f3065
--- /dev/null
+++ b/gradiodemo.py
@@ -0,0 +1,93 @@
+import matplotlib.pyplot as plt
+
+import os
+import json
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader
+
+import commons
+import utils
+from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
+
+import sys
+from subprocess import call
+
+def run_cmd(command):
+    try:
+        print(command)
+        call(command, shell=True)
+    except KeyboardInterrupt:
+        print("Process interrupted")
+        sys.exit(1)
+
+current = os.getcwd()
+print(current)
+full = current + "/monotonic_align"
+print(full)
+os.chdir(full)
+print(os.getcwd())
+run_cmd("python3 setup.py build_ext --inplace")
+run_cmd("apt-get install espeak -y")
+os.chdir("..")
+print(os.getcwd())
+
+from models import SynthesizerTrn
+from text.symbols import symbols
+from text import text_to_sequence
+
+from scipy.io.wavfile import write
+import gradio as gr
+import scipy.io.wavfile
+import numpy as np
+import torchtext
+
+
+
+
+
+torchtext.utils.download_from_url("https://drive.google.com/uc?id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT", root=".")
+
+
+def get_text(text, hps):
+    text_norm = text_to_sequence(text, hps.data.text_cleaners)
+    if hps.data.add_blank:
+        text_norm = commons.intersperse(text_norm, 0)
+    text_norm = torch.LongTensor(text_norm)
+    return text_norm
+
+hps = utils.get_hparams_from_file("./configs/ljs_base.json")
+net_g = SynthesizerTrn(
+    len(symbols),
+    hps.data.filter_length // 2 + 1,
+    hps.train.segment_size // hps.data.hop_length,
+    **hps.model)
+_ = net_g.eval()
+
+_ = utils.load_checkpoint("pretrained_ljs.pth", net_g, None)
+def inference(text):
+    stn_tst = get_text(text, hps)
+    with torch.no_grad():
+        x_tst = stn_tst.unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.float().numpy()
+    scipy.io.wavfile.write("out.wav", hps.data.sampling_rate, audio)
+    return "./out.wav"
+
+
+inputs = gr.inputs.Textbox(lines=5, label="Input Text")
+outputs = gr.outputs.File(label="Output Audio")
+
+
+title = "VITS"
+description = "demo for VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. To use it, simply add your text, or click one of the examples to load them. Read more at the links below."
+article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2106.06103'>Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech</a> | <a href='https://github.com/jaywalnut310/vits'>Github Repo</a></p>"
" + +examples = [ + ["We propose VITS, Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech."], + ["Our method adopts variational inference augmented with normalizing flows and an adversarial training process, which improves the expressive power of generative modeling."] +] + +gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch() diff --git a/requirements.txt b/requirements.txt index ecacbad3..9c1a41b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,8 @@ numpy==1.18.5 phonemizer==2.2.1 scipy==1.5.2 tensorboard==2.3.0 -torch==1.6.0 -torchvision==0.7.0 +torch +torchvision Unidecode==1.1.1 +gradio +torchtext