
Commit 72112a6

Add Jets implementation (#231)

1 parent efcdf4b commit 72112a6

File tree

20 files changed: +3205 −3 lines changed

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -44,6 +44,7 @@ In addition to the specific generation tasks, Amphion includes several **vocoder
 - [VITS](https://arxiv.org/abs/2106.06103): An end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning
 - [VALL-E](https://arxiv.org/abs/2301.02111): A zero-shot TTS architecture that uses a neural codec language model with discrete codes.
 - [NaturalSpeech2](https://arxiv.org/abs/2304.09116): An architecture for TTS that utilizes a latent diffusion model to generate natural-sounding voices.
+- [Jets](Jets): An end-to-end TTS model that jointly trains FastSpeech2 and HiFi-GAN with an alignment module.
 
 ### SVC: Singing Voice Conversion
```
bins/tts/inference.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -11,6 +11,7 @@
 from models.tts.vits.vits_inference import VitsInference
 from models.tts.valle.valle_inference import VALLEInference
 from models.tts.naturalspeech2.ns2_inference import NS2Inference
+from models.tts.jets.jets_inference import JetsInference
 from utils.util import load_config
 import torch

@@ -21,6 +22,7 @@ def build_inference(args, cfg):
         "VITS": VitsInference,
         "VALLE": VALLEInference,
         "NaturalSpeech2": NS2Inference,
+        "Jets": JetsInference,
     }

     inference_class = supported_inference[cfg.model_type]
```

bins/tts/train.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -11,8 +11,9 @@
 from models.tts.vits.vits_trainer import VITSTrainer
 from models.tts.valle.valle_trainer import VALLETrainer
 from models.tts.naturalspeech2.ns2_trainer import NS2Trainer
-from models.tts.VALLE_V2.valle_ar_trainer import ValleARTrainer as VALLE_V2_AR
-from models.tts.VALLE_V2.valle_nar_trainer import ValleNARTrainer as VALLE_V2_NAR
+from models.tts.valle_v2.valle_ar_trainer import ValleARTrainer as VALLE_V2_AR
+from models.tts.valle_v2.valle_nar_trainer import ValleNARTrainer as VALLE_V2_NAR
+from models.tts.jets.jets_trainer import JetsTrainer

 from utils.util import load_config

@@ -25,6 +26,7 @@ def build_trainer(args, cfg):
         "NaturalSpeech2": NS2Trainer,
         "VALLE_V2_AR": VALLE_V2_AR,
         "VALLE_V2_NAR": VALLE_V2_NAR,
+        "Jets": JetsTrainer,
     }

     trainer_class = supported_trainer[cfg.model_type]
```
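Both `bins/tts/inference.py` and `bins/tts/train.py` wire the new model in the same way: one import plus one entry in a model-type registry keyed by `cfg.model_type`. A minimal, runnable sketch of that dispatch pattern (the stub class below is illustrative, not Amphion's actual `JetsTrainer`):

```python
from types import SimpleNamespace

class JetsTrainer:
    """Illustrative stub; Amphion's real class lives in models/tts/jets/jets_trainer.py."""
    def __init__(self, args, cfg):
        self.args, self.cfg = args, cfg

    def train(self):
        print(f"training model_type={self.cfg.model_type}")

def build_trainer(args, cfg):
    supported_trainer = {
        "Jets": JetsTrainer,
        # "VITS": VITSTrainer, "VALLE": VALLETrainer, ... registered the same way
    }
    trainer_class = supported_trainer[cfg.model_type]  # KeyError means unsupported model
    return trainer_class(args, cfg)

trainer = build_trainer(args=None, cfg=SimpleNamespace(model_type="Jets"))
trainer.train()  # prints: training model_type=Jets
```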

config/jets.json

Lines changed: 120 additions & 0 deletions
```json
{
    "base_config": "config/tts.json",
    "model_type": "Jets",
    "task_type": "tts",
    "dataset": ["LJSpeech"],
    "preprocess": {
        // acoustic features
        "extract_audio": true,
        "extract_mel": true,
        "mel_extract_mode": "taco",
        "mel_min_max_norm": false,
        "extract_pitch": true,
        "extract_uv": false,
        "pitch_extractor": "dio",
        "extract_energy": true,
        "energy_extract_mode": "from_tacotron_stft",
        "extract_duration": true,
        "use_phone": false,
        "pitch_norm": true,
        "energy_norm": true,
        "pitch_remove_outlier": true,
        "energy_remove_outlier": true,

        // Default config
        "n_mel": 80,
        "win_size": 1024, // todo
        "hop_size": 256,
        "sample_rate": 22050,
        "n_fft": 1024, // todo
        "fmin": 0,
        "fmax": 8000, // todo
        "raw_data": "raw_data",
        "text_cleaners": ["english_cleaners"],
        "f0_min": 71, // ~C2
        "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_label": true,
        "is_mu_law": true,
        "bits": 8,

        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "content_vector_dir": "content_vector",
        "wenet_dir": "wenet",
        "mert_dir": "mert",
        "spk2id": "spk2id.json",
        "utt2spk": "utt2spk",
        "valid_file": "test.json",

        // Features used for model training
        "use_mel": true,
        "use_min_max_norm_mel": false,
        "use_frame_pitch": true,
        "use_frame_energy": true,
        "use_phone_pitch": false,
        "use_phone_energy": false,
        "use_log_scale_pitch": false,
        "use_log_scale_energy": false,
        "use_spkid": false,
        "align_mel_duration": true,
        "text_cleaners": ["english_cleaners"],
        "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
    },
    "model": {
        // Settings for transformer
        "transformer": {
            "encoder_layer": 4,
            "encoder_head": 2,
            "encoder_hidden": 256,
            "decoder_layer": 6,
            "decoder_head": 2,
            "decoder_hidden": 256,
            "conv_filter_size": 1024,
            "conv_kernel_size": [9, 1],
            "encoder_dropout": 0.2,
            "decoder_dropout": 0.2
        },

        // Settings for variance_predictor
        "variance_predictor": {
            "filter_size": 256,
            "kernel_size": 3,
            "dropout": 0.5
        },
        "variance_embedding": {
            "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
            "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
            "n_bins": 256
        },
        "max_seq_len": 1000
    },
    "train": {
        "batch_size": 16,
        "max_epoch": 100,
        "sort_sample": true,
        "drop_last": true,
        "group_size": 4,
        "grad_clip_thresh": 1.0,
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "lr_scheduler": {
            "num_warmup": 4000
        },
        // LR Scheduler
        "scheduler": "NoamLR",
        // Optimizer
        "optimizer": "Adam",
        "adam": {
            "lr": 0.0625,
            "betas": [0.9, 0.98],
            "eps": 0.000000001,
            "weight_decay": 0.0
        },
    }
}
```
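Note how `"scheduler": "NoamLR"` and `"num_warmup": 4000` pair with the seemingly large `"lr": 0.0625`: under a Noam schedule the configured rate acts as a peak-scale factor, not the actual step size. A sketch of the standard Noam formula, with values mirroring this config; whether Amphion's `NoamLR` folds the `d_model ** -0.5` term into the configured rate is an assumption of this sketch (`d_model` taken as the 256-dim encoder hidden size):

```python
def noam_lr(step: int, base_lr: float = 0.0625, warmup: int = 4000, d_model: int = 256) -> float:
    """Noam schedule: linear warmup for `warmup` steps, then inverse-sqrt decay."""
    step = max(step, 1)
    return base_lr * (d_model ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)

# The peak LR lands at the end of warmup: noam_lr(4000) ≈ 6.2e-5.
```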

egs/tts/Jets/README.md

Lines changed: 138 additions & 0 deletions
# Jets Recipe

In this recipe, we will show how to train [Jets](https://arxiv.org/abs/2203.16852) using Amphion's infrastructure. Jets is an end-to-end text-to-speech (E2E-TTS) model that jointly trains FastSpeech2 and HiFi-GAN with an alignment module.
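At a high level, the joint objective combines the HiFi-GAN generator losses with FastSpeech2's variance losses and an alignment-learning loss. Schematically (a paraphrase of the paper's objective, with the weighting factors λ left abstract):

$$
\mathcal{L}_{G} = \underbrace{\mathcal{L}_{adv} + \lambda_{fm}\,\mathcal{L}_{fm} + \lambda_{mel}\,\mathcal{L}_{mel}}_{\text{HiFi-GAN}} + \underbrace{\mathcal{L}_{dur} + \mathcal{L}_{pitch} + \mathcal{L}_{energy}}_{\text{FastSpeech2 variance}} + \underbrace{\mathcal{L}_{align}}_{\text{alignment module}}
$$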
There are four stages in total:

1. Data preparation
2. Feature extraction
3. Training
4. Inference

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
>
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

### Dataset Download

You can use LJSpeech to train the TTS model. How to download the dataset is detailed [here](../../datasets/README.md).

### Configuration

After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
"dataset": [
    "LJSpeech",
],
"dataset_path": {
    // TODO: Fill in your dataset path
    "LJSpeech": "[LJSpeech dataset path]",
},
```
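For reference, the committed `egs/tts/Jets/exp_config.json` (shown later in this commit) fills this in with the relative path `"../LJSpeech-1.1"`, i.e., the dataset folder sitting next to the Amphion checkout.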
## 2. Feature Extraction

### Configuration

Specify the `processed_dir` and the `log_dir` for saving the processed data and the checkpoints in `exp_config.json`:

```json
// TODO: Fill in the output log path
"log_dir": "ckpts/tts",
"preprocess": {
    // TODO: Fill in the output data path
    "processed_dir": "data",
    ...
},
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`):

```bash
sh egs/tts/Jets/run.sh --stage 1
```

## 3. Training

### Configuration

We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 16,
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `ckpts/tts/[YourExptName]`.

```bash
sh egs/tts/Jets/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. We recommend using only one GPU for training.

## 4. Inference

### Configuration

For inference, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --- | --- | --- |
| `--infer_expt_dir` | The experiment directory which contains the `checkpoint` | `ckpts/tts/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audio. | `ckpts/tts/[YourExptName]/result` |
| `--infer_mode` | The inference mode, e.g., `batch`. | Use `batch` to generate a batch of speech at a time. |
| `--infer_dataset` | The dataset used for inference. | For the LJSpeech dataset, the inference dataset would be `LJSpeech`. |
| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., `test`. | For the LJSpeech dataset, the testing set would be the `test` split created during feature extraction. |

### Run

For example, if you want to generate speech for the whole testing set split from LJSpeech, just run:

```bash
sh egs/tts/Jets/run.sh --stage 3 \
    --infer_expt_dir ckpts/tts/[YourExptName] \
    --infer_output_dir ckpts/tts/[YourExptName]/result \
    --infer_mode "batch" \
    --infer_dataset "LJSpeech" \
    --infer_testing_set "test"
```
### Issues and Solutions

```
NotImplementedError: Using RTX 3090 or 4000 series doesn't support faster communication broadband via P2P or IB. Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which will do this automatically.
2024-02-24 10:57:49 | INFO | torch.distributed.distributed_c10d | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.
```

This error reports that NVIDIA RTX 3090 and 4000 series GPUs do not support peer-to-peer (P2P) communication or InfiniBand (IB), which are used for faster inter-GPU communication. It is raised by the `accelerate` library, which facilitates distributed training and inference.

To fix this issue, set the environment variables in your terminal before running your script:

```bash
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
```
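Alternatively, the same variables can be set from Python (a minimal sketch; NCCL reads them at initialization time, so this must run before any `torch.distributed` or `accelerate` setup code):

```python
import os

# Must be set before torch.distributed / accelerate initializes NCCL.
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
```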
### Note

Extensive logging messages related to `torch._subclasses.fake_tensor` and `torch._dynamo.output_graph` may be observed during inference. Attempts to suppress these logs have not been successful, but they do not affect the inference results.

If you use Jets in your work, please cite the original paper:

```bibtex
@article{lim2022jets,
  title={JETS: Jointly training FastSpeech2 and HiFi-GAN for end to end text to speech},
  author={Lim, Dan and Jung, Sunghee and Kim, Eesung},
  journal={arXiv preprint arXiv:2203.16852},
  year={2022}
}
```

egs/tts/Jets/exp_config.json

Lines changed: 32 additions & 0 deletions
```json
{
    "base_config": "config/jets.json",
    "model_type": "Jets",
    "dataset": [
        "LJSpeech"
    ],
    "dataset_path": {
        "LJSpeech": "../LJSpeech-1.1"
    },
    "log_dir": "ckpts/tts",
    "preprocess": {
        "processed_dir": "data",
        "sample_rate": 22050,
        "use_audios": true,
        "extract_audio": true,
    },
    "train": {
        "batch_size": 16,
        "max_epoch": 100,
        "learning_rate": 2e-4,
        "AdamW": {
            "betas": [
                0.8,
                0.99
            ],
            "eps": 1e-9,
        },
        "lr_decay": 0.999875,
        "segment_size": 64,
        "upsample_factor": 256
    }
}
```
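This file only overrides a handful of keys; everything else is inherited through the `base_config` chain (`exp_config.json` → `config/jets.json` → `config/tts.json`). Amphion loads configs via `utils.util.load_config`; a minimal sketch of how such recursive merging could work, under the assumption that child keys override base keys (Amphion's actual merge semantics are not shown here, and `json5` is used because the configs contain `//` comments):

```python
import json5  # pip install json5 -- strict json rejects the // comments above

def load_config(path: str) -> dict:
    """Resolve a 'base_config' chain; paths are relative to the Amphion root."""
    with open(path) as f:
        cfg = json5.load(f)
    base_path = cfg.pop("base_config", None)
    if base_path is None:
        return cfg
    merged = load_config(base_path)
    _deep_update(merged, cfg)
    return merged

def _deep_update(dst: dict, src: dict) -> None:
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dst.get(key), dict):
            _deep_update(dst[key], value)  # merge nested sections like "train"
        else:
            dst[key] = value  # child value wins outright

cfg = load_config("egs/tts/Jets/exp_config.json")
print(cfg["model_type"])  # "Jets", with defaults filled in from the base configs
```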

egs/tts/Jets/prepare_mfa.sh

Lines changed: 29 additions & 0 deletions
```bash
#!/bin/bash

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Navigate to the 'pretrained' directory
cd pretrained || { echo "Failed to change directory to 'pretrained'"; exit 1; }

# Create and navigate to the 'mfa' directory
mkdir -p mfa && cd mfa || { echo "Failed to create or change directory to 'mfa'"; exit 1; }

# Define the MFA file URL and the file name
mfa_url="https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.1.0-beta.2/montreal-forced-aligner_linux.tar.gz"
mfa_file="montreal-forced-aligner_linux.tar.gz"

# Download MFA if it doesn't exist
if [ ! -f "$mfa_file" ]; then
    wget "$mfa_url" || { echo "Failed to download MFA"; exit 1; }
fi

# Extract MFA
tar -zxvf "$mfa_file" || { echo "Failed to extract MFA"; exit 1; }

# Optionally, remove the tar.gz file after extraction
rm "$mfa_file"

echo "MFA setup completed successfully."
```
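Since the config sets `"extract_duration": true`, the recipe presumably uses Montreal Forced Aligner for phone durations; the script would then be run once from the Amphion root (so that the relative `pretrained/` path resolves) as `sh egs/tts/Jets/prepare_mfa.sh` before the feature-extraction stage.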
