Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gradio web demo #4

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ Visit our [demo](https://jaywalnut310.github.io/vits-demo/index.html) for audio

We also provide the [pretrained models](https://drive.google.com/drive/folders/1ksarh-cJf3F5eKJjLVWY0X1j1qsQqiS2?usp=sharing).

** Update note: An interactive [Gradio Web Demo](https://gradio.app/hub/AK391/vits) is now available.

** Update note: Thanks to [Rishikesh (ऋषिकेश)](https://github.com/jaywalnut310/vits/issues/1), our interactive TTS demo is now available on [Colab Notebook](https://colab.research.google.com/drive/1CO61pZizDj7en71NQG_aqqKdGaA_SaBf?usp=sharing).

<table style="width:100%">
Expand Down
93 changes: 93 additions & 0 deletions gradiodemo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import matplotlib.pyplot as plt

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate

import sys
from subprocess import call

def run_cmd(command):
    """Run ``command`` through the system shell and return its exit status.

    The command line is echoed to stdout before it is executed. A non-zero
    exit status is reported on stdout but does not raise, so callers that
    ignore the return value keep their best-effort behaviour.

    Args:
        command: Shell command line. It is executed with ``shell=True``, so
            it must only ever be built from trusted, hard-coded text.

    Returns:
        The integer exit status reported by the shell.

    Raises:
        SystemExit: With status 1 when the run is interrupted by Ctrl-C.
    """
    try:
        print(command)
        status = call(command, shell=True)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)
    if status != 0:
        # Previously the status was dropped, so a failed build/install step
        # only surfaced later as an unrelated-looking import error.
        print("Command failed with exit status {}: {}".format(status, command))
    return status

# One-time environment bootstrap: run the extension build and the espeak
# install from inside ./monotonic_align, then step back up one directory.
# The working directory is printed at each stage for debugging.
current = os.getcwd()
print(current)
full = "/".join((current, "monotonic_align"))
print(full)
os.chdir(full)
print(os.getcwd())
for setup_command in (
    "python3 setup.py build_ext --inplace",
    "apt-get install espeak -y",
):
    run_cmd(setup_command)
os.chdir("..")
print(os.getcwd())

from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write
import gradio as gr
import scipy.io.wavfile
import numpy as np
import torchtext





# Fetch a file from Google Drive into the current directory at import time.
# NOTE(review): presumably this is the "pretrained_ljs.pth" checkpoint that
# is loaded further down - confirm the downloaded file name matches.
torchtext.utils.download_from_url(
    "https://drive.google.com/uc?id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT",
    root=".",
)


def get_text(text, hps):
    """Convert raw text into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: Input string to synthesize.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` is passed to
            ``text_to_sequence`` and ``hps.data.add_blank`` toggles blank
            insertion.

    Returns:
        A ``torch.LongTensor`` built from the id sequence. When
        ``hps.data.add_blank`` is truthy the sequence is first passed through
        ``commons.intersperse(..., 0)``.
    """
    symbol_ids = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        symbol_ids = commons.intersperse(symbol_ids, 0)
    return torch.LongTensor(symbol_ids)

# Load hyper-parameters, build the synthesizer from them, call eval() on it,
# and restore weights from the local checkpoint file.
hps = utils.get_hparams_from_file("./configs/ljs_base.json")

# NOTE(review): presumably the spectrogram bin count and the training segment
# length in frames - confirm against the SynthesizerTrn signature.
spec_bins = hps.data.filter_length // 2 + 1
segment_frames = hps.train.segment_size // hps.data.hop_length

net_g = SynthesizerTrn(len(symbols), spec_bins, segment_frames, **hps.model)
net_g.eval()

# The third argument is None; presumably the optimizer slot, unused here.
utils.load_checkpoint("pretrained_ljs.pth", net_g, None)
def inference(text):
    """Synthesize speech for ``text`` and return the path of the WAV file.

    The text is converted with ``get_text``, passed as a batch of one to
    ``net_g.infer`` under ``torch.no_grad()``, and the resulting array is
    written to ``out.wav`` at ``hps.data.sampling_rate``.

    NOTE(review): the output path is fixed, so every call overwrites the same
    file; overlapping requests would share it.

    Args:
        text: Input string to synthesize.

    Returns:
        The string ``"./out.wav"``.
    """
    token_ids = get_text(text, hps)
    with torch.no_grad():
        batch = token_ids.unsqueeze(0)
        batch_lengths = torch.LongTensor([token_ids.size(0)])
        generated = net_g.infer(
            batch,
            batch_lengths,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1,
        )
        audio = generated[0][0, 0].data.float().numpy()
    scipy.io.wavfile.write("out.wav", hps.data.sampling_rate, audio)
    return "./out.wav"


# Gradio front end: a multi-line text box in, a downloadable file out.
inputs = gr.inputs.Textbox(lines=5, label="Input Text")
outputs = gr.outputs.File(label="Output Audio")

# Page copy shown around the widget.
title = "VITS"
description = (
    "demo for VITS: Conditional Variational Autoencoder with Adversarial Learning "
    "for End-to-End Text-to-Speech. To use it, simply add your text, or click one "
    "of the examples to load them. Read more at the links below."
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://arxiv.org/abs/2106.06103'>Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech</a>"
    " | "
    "<a href='https://github.com/jaywalnut310/vits'>Github Repo</a>"
    "</p>"
)

# Each example is a one-element row matching the single text input.
examples = [
    ["We propose VITS, Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech."],
    ["Our method adopts variational inference augmented with normalizing flows and an adversarial training process, which improves the expressive power of generative modeling."],
]

# Building and launching happen at import time, as in the original script.
demo = gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()
6 changes: 4 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ numpy==1.18.5
phonemizer==2.2.1
scipy==1.5.2
tensorboard==2.3.0
torch==1.6.0
torchvision==0.7.0
torch
torchvision
Unidecode==1.1.1
gradio
torchtext