From e2fbba9cea9e060da4844558cefa960764b28a13 Mon Sep 17 00:00:00 2001 From: AK391 Date: Mon, 14 Jun 2021 17:48:46 +0000 Subject: [PATCH 01/18] gradio --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index ecacbad3..7dd1b352 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ tensorboard==2.3.0 torch==1.6.0 torchvision==0.7.0 Unidecode==1.1.1 +gradio From 91c1dbe1475d68a7d486986ee055f93a6122961e Mon Sep 17 00:00:00 2001 From: AK391 Date: Mon, 14 Jun 2021 17:50:46 +0000 Subject: [PATCH 02/18] gradio --- gradiodemo.py | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 gradiodemo.py diff --git a/gradiodemo.py b/gradiodemo.py new file mode 100644 index 00000000..6f440740 --- /dev/null +++ b/gradiodemo.py @@ -0,0 +1,60 @@ +%matplotlib inline +import matplotlib.pyplot as plt +import IPython.display as ipd + +import os +import json +import math +import torch +from torch import nn +from torch.nn import functional as F +from torch.utils.data import DataLoader + +import commons +import utils +from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate +from models import SynthesizerTrn +from text.symbols import symbols +from text import text_to_sequence + +from scipy.io.wavfile import write +import gradio as gr +import scipy.io.wavfile +import numpy as np + + +def get_text(text, hps): + text_norm = text_to_sequence(text, hps.data.text_cleaners) + if hps.data.add_blank: + text_norm = commons.intersperse(text_norm, 0) + text_norm = torch.LongTensor(text_norm) + return text_norm + +hps = utils.get_hparams_from_file("./configs/ljs_base.json") +net_g = SynthesizerTrn( + len(symbols), + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model) +_ = net_g.eval() + +_ = utils.load_checkpoint("pretrained_ljs.pth", net_g, None) +def inference(text): + stn_tst = get_text(text, hps) + with torch.no_grad(): + x_tst = stn_tst.unsqueeze(0) + x_tst_lengths = torch.LongTensor([stn_tst.size(0)]) + audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.float().numpy() + scipy.io.wavfile.write("out.wav", hps.data.sampling_rate, audio) + return "./out.wav" + + +inputs = gr.inputs.Textbox(label="Input Text") +outputs = gr.outputs.File(label="Output Audio") + + +title = "VITS" +description = "demo for VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. To use it, simply upload your text, or click one of the examples to load them. Read more at the links below." +article = "

Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech | Github Repo

" + +gr.Interface(inference, inputs, outputs, title=title, description=description, article=article).launch(debug=True) \ No newline at end of file From 9c36366a7131d2c18f05b6b1f206e54e92434dfa Mon Sep 17 00:00:00 2001 From: AK391 Date: Mon, 14 Jun 2021 18:13:01 +0000 Subject: [PATCH 03/18] syntax fix --- gradiodemo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gradiodemo.py b/gradiodemo.py index 6f440740..5b5ee3b5 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -1,4 +1,3 @@ -%matplotlib inline import matplotlib.pyplot as plt import IPython.display as ipd From 3c858286c48bd8cb417e3879ed708a44ba0051fb Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 14:25:20 -0400 Subject: [PATCH 04/18] remove --- gradiodemo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gradiodemo.py b/gradiodemo.py index 5b5ee3b5..90467ef9 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -1,5 +1,4 @@ import matplotlib.pyplot as plt -import IPython.display as ipd import os import json @@ -56,4 +55,4 @@ def inference(text): description = "demo for VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. To use it, simply upload your text, or click one of the examples to load them. Read more at the links below." article = "

Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech | Github Repo

" -gr.Interface(inference, inputs, outputs, title=title, description=description, article=article).launch(debug=True) \ No newline at end of file +gr.Interface(inference, inputs, outputs, title=title, description=description, article=article).launch(debug=True) From 1b8777f9449fe27f32ff62ec3aa69e926855ab13 Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 14:39:15 -0400 Subject: [PATCH 05/18] add model --- gradiodemo.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gradiodemo.py b/gradiodemo.py index 90467ef9..34f31d7a 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -19,6 +19,9 @@ import gradio as gr import scipy.io.wavfile import numpy as np +import torchtext + +torchtext.utils.download_from_url("https://drive.google.com/uc?id=1RILKwUdjjBBngB17JHwhZNBEaW4Mr-Ml", root=".") def get_text(text, hps): From 3b8ea149a9042ddda4bc09ca4f4194f7e669975a Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 14:39:30 -0400 Subject: [PATCH 06/18] torchtext --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7dd1b352..45241668 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ torch==1.6.0 torchvision==0.7.0 Unidecode==1.1.1 gradio +torchtext From 2d91da9425f13cc368d54922d406176b752edf12 Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 14:40:14 -0400 Subject: [PATCH 07/18] replace link --- gradiodemo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradiodemo.py b/gradiodemo.py index 34f31d7a..eed2579b 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -21,7 +21,7 @@ import numpy as np import torchtext -torchtext.utils.download_from_url("https://drive.google.com/uc?id=1RILKwUdjjBBngB17JHwhZNBEaW4Mr-Ml", root=".") +torchtext.utils.download_from_url("https://drive.google.com/uc?id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT", root=".") def get_text(text, hps): From 36e73b56cea2cf25f449f9b638004a98dfe071c1 Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 14:48:01 -0400 Subject: [PATCH 08/18] subproceess --- gradiodemo.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gradiodemo.py b/gradiodemo.py index eed2579b..61a8c18e 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -21,6 +21,25 @@ import numpy as np import torchtext +import sys +from subprocess import call + +def run_cmd(command): + try: + print(command) + call(command, shell=True) + except KeyboardInterrupt: + print("Process interrupted") + sys.exit(1) + + + +run_cmd("cd monotonic_align") +run_cmd("sudo python setup.py build_ext --inplace") +run_cmd("sudo apt-get install espeak -y") +run_cmd("cd ..") + + torchtext.utils.download_from_url("https://drive.google.com/uc?id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT", root=".") From c2ab5d82fedbd1244855657f86fef0f6e1d6c201 Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 15:08:07 -0400 Subject: [PATCH 09/18] dir change --- gradiodemo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gradiodemo.py b/gradiodemo.py index 61a8c18e..e820192b 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -33,11 +33,12 @@ def run_cmd(command): sys.exit(1) - +os.chdir("./monotonic_align") run_cmd("cd monotonic_align") run_cmd("sudo python setup.py build_ext --inplace") run_cmd("sudo apt-get install espeak -y") -run_cmd("cd ..") +os.chdir("../vits") + torchtext.utils.download_from_url("https://drive.google.com/uc?id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT", root=".") From 206fe321e863b9d6335496438e93257ed5dedec8 Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 15:08:44 -0400 Subject: [PATCH 10/18] remove --- gradiodemo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gradiodemo.py b/gradiodemo.py index e820192b..6d145e30 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -34,7 +34,6 @@ def run_cmd(command): os.chdir("./monotonic_align") -run_cmd("cd monotonic_align") run_cmd("sudo python setup.py build_ext --inplace") run_cmd("sudo apt-get install espeak -y") os.chdir("../vits") From a46fa2dc7ab13b2600c1c4298d54efd71b77044b Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 15:38:34 -0400 Subject: [PATCH 11/18] change --- gradiodemo.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/gradiodemo.py b/gradiodemo.py index 6d145e30..1ddb5a15 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -11,15 +11,6 @@ import commons import utils from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate -from models import SynthesizerTrn -from text.symbols import symbols -from text import text_to_sequence - -from scipy.io.wavfile import write -import gradio as gr -import scipy.io.wavfile -import numpy as np -import torchtext import sys from subprocess import call @@ -36,7 +27,19 @@ def run_cmd(command): os.chdir("./monotonic_align") run_cmd("sudo python setup.py build_ext --inplace") run_cmd("sudo apt-get install espeak -y") -os.chdir("../vits") +os.chdir("..") + +from models import SynthesizerTrn +from text.symbols import symbols +from text import text_to_sequence + +from scipy.io.wavfile import write +import gradio as gr +import scipy.io.wavfile +import numpy as np +import torchtext + + From 63314da1ab8cf2494c52b02f14c8b9e9ef5babbe Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 16:39:24 -0400 Subject: [PATCH 12/18] dir chnage --- gradiodemo.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gradiodemo.py b/gradiodemo.py index 1ddb5a15..898ed777 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -22,9 +22,12 @@ def run_cmd(command): except KeyboardInterrupt: print("Process interrupted") sys.exit(1) - - -os.chdir("./monotonic_align") + +current = os.getcwd() +print(current) +full = current + "/monotonic_align" +print(full) +os.chdir(full) run_cmd("sudo python setup.py build_ext --inplace") run_cmd("sudo apt-get install espeak -y") os.chdir("..") From db48911c5a13eee4f1018bb1de780e3e1ff3558e Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 17:59:16 -0400 Subject: [PATCH 13/18] print --- gradiodemo.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gradiodemo.py b/gradiodemo.py index 898ed777..b2ba690c 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -28,9 +28,11 @@ def run_cmd(command): full = current + "/monotonic_align" print(full) os.chdir(full) +print(os.getcwd()) run_cmd("sudo python setup.py build_ext --inplace") run_cmd("sudo apt-get install espeak -y") os.chdir("..") +print(os.getcwd()) from models import SynthesizerTrn from text.symbols import symbols From 2020ee3a6d0b34291ded148248cf372760cb60b3 Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 18:14:38 -0400 Subject: [PATCH 14/18] remove sudo --- gradiodemo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gradiodemo.py b/gradiodemo.py index b2ba690c..2aa6fee3 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -29,8 +29,8 @@ def run_cmd(command): print(full) os.chdir(full) print(os.getcwd()) -run_cmd("sudo python setup.py build_ext --inplace") -run_cmd("sudo apt-get install espeak -y") +run_cmd("python3 setup.py build_ext --inplace") +run_cmd("apt-get install espeak -y") os.chdir("..") print(os.getcwd()) From 5c88a50de3de0a3c3eb5a1b78d11c56060f89924 Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 18:54:00 -0400 Subject: [PATCH 15/18] upgrade --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 45241668..9c1a41b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,8 +5,8 @@ numpy==1.18.5 phonemizer==2.2.1 scipy==1.5.2 tensorboard==2.3.0 -torch==1.6.0 -torchvision==0.7.0 +torch +torchvision Unidecode==1.1.1 gradio torchtext From b28873598889e9db3369168cf246fda7214a1527 Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 19:08:14 -0400 Subject: [PATCH 16/18] add examples --- gradiodemo.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gradiodemo.py b/gradiodemo.py index 2aa6fee3..495b2b5c 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -82,7 +82,12 @@ def inference(text): title = "VITS" -description = "demo for VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. To use it, simply upload your text, or click one of the examples to load them. Read more at the links below." +description = "demo for VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. To use it, simply add your text, or click one of the examples to load them. Read more at the links below." article = "

Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech | Github Repo

" -gr.Interface(inference, inputs, outputs, title=title, description=description, article=article).launch(debug=True) +examples = [ + ["We propose VITS, Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech."], + ["Our method adopts variational inference augmented with normalizing flows and an adversarial training process, which improves the expressive power of generative modeling."] +] + +gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch() From 9f47bc9f4700d46cffb1604e9497cf82b8b3082f Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 19:22:57 -0400 Subject: [PATCH 17/18] lines 5 --- gradiodemo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradiodemo.py b/gradiodemo.py index 495b2b5c..681f3065 100644 --- a/gradiodemo.py +++ b/gradiodemo.py @@ -77,7 +77,7 @@ def inference(text): return "./out.wav" -inputs = gr.inputs.Textbox(label="Input Text") +inputs = gr.inputs.Textbox(lines=5, label="Input Text") outputs = gr.outputs.File(label="Output Audio") From 1b6db7437cd5224baae9776139f890ca315fad8a Mon Sep 17 00:00:00 2001 From: AK391 <81195143+AK391@users.noreply.github.com> Date: Mon, 14 Jun 2021 19:23:56 -0400 Subject: [PATCH 18/18] link --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index f7883f8c..b1c7890c 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ Visit our [demo](https://jaywalnut310.github.io/vits-demo/index.html) for audio We also provide the [pretrained models](https://drive.google.com/drive/folders/1ksarh-cJf3F5eKJjLVWY0X1j1qsQqiS2?usp=sharing). +** Update note: [Gradio Web Demo](https://gradio.app/hub/AK391/vits) + ** Update note: Thanks to [Rishikesh (ऋषिकेश)](https://github.com/jaywalnut310/vits/issues/1), our interactive TTS demo is now available on [Colab Notebook](https://colab.research.google.com/drive/1CO61pZizDj7en71NQG_aqqKdGaA_SaBf?usp=sharing).