
Commit 72112a6

Add Jets implementation (#231)

1 parent efcdf4b commit 72112a6

File tree

20 files changed: +3205 −3 lines changed

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -44,6 +44,7 @@ In addition to the specific generation tasks, Amphion includes several **vocoder
 - [VITS](https://arxiv.org/abs/2106.06103): An end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning
 - [VALL-E](https://arxiv.org/abs/2301.02111): A zero-shot TTS architecture that uses a neural codec language model with discrete codes.
 - [NaturalSpeech2](https://arxiv.org/abs/2304.09116): An architecture for TTS that utilizes a latent diffusion model to generate natural-sounding voices.
+- [Jets](Jets): An end-to-end TTS model that jointly trains FastSpeech2 and HiFi-GAN with an alignment module.
 
 ### SVC: Singing Voice Conversion
```
bins/tts/inference.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -11,6 +11,7 @@
 from models.tts.vits.vits_inference import VitsInference
 from models.tts.valle.valle_inference import VALLEInference
 from models.tts.naturalspeech2.ns2_inference import NS2Inference
+from models.tts.jets.jets_inference import JetsInference
 from utils.util import load_config
 import torch

@@ -21,6 +22,7 @@ def build_inference(args, cfg):
         "VITS": VitsInference,
         "VALLE": VALLEInference,
         "NaturalSpeech2": NS2Inference,
+        "Jets": JetsInference,
     }

     inference_class = supported_inference[cfg.model_type]
```

bins/tts/train.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -11,8 +11,9 @@
 from models.tts.vits.vits_trainer import VITSTrainer
 from models.tts.valle.valle_trainer import VALLETrainer
 from models.tts.naturalspeech2.ns2_trainer import NS2Trainer
-from models.tts.VALLE_V2.valle_ar_trainer import ValleARTrainer as VALLE_V2_AR
-from models.tts.VALLE_V2.valle_nar_trainer import ValleNARTrainer as VALLE_V2_NAR
+from models.tts.valle_v2.valle_ar_trainer import ValleARTrainer as VALLE_V2_AR
+from models.tts.valle_v2.valle_nar_trainer import ValleNARTrainer as VALLE_V2_NAR
+from models.tts.jets.jets_trainer import JetsTrainer

 from utils.util import load_config

@@ -25,6 +26,7 @@ def build_trainer(args, cfg):
         "NaturalSpeech2": NS2Trainer,
         "VALLE_V2_AR": VALLE_V2_AR,
         "VALLE_V2_NAR": VALLE_V2_NAR,
+        "Jets": JetsTrainer,
     }

     trainer_class = supported_trainer[cfg.model_type]
```
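Both `bins/tts/inference.py` and `bins/tts/train.py` wire the new model in the same way: one import plus one entry in a model-type registry keyed by `cfg.model_type`. A minimal, runnable sketch of that dispatch pattern (the stub class below is illustrative, not Amphion's actual `JetsTrainer`):

```python
from types import SimpleNamespace

class JetsTrainer:
    """Illustrative stub; Amphion's real class lives in models/tts/jets/jets_trainer.py."""
    def __init__(self, args, cfg):
        self.args, self.cfg = args, cfg

    def train(self):
        print(f"training model_type={self.cfg.model_type}")

def build_trainer(args, cfg):
    supported_trainer = {
        "Jets": JetsTrainer,
        # "VITS": VITSTrainer, "VALLE": VALLETrainer, ... registered the same way
    }
    trainer_class = supported_trainer[cfg.model_type]  # KeyError means unsupported model
    return trainer_class(args, cfg)

trainer = build_trainer(args=None, cfg=SimpleNamespace(model_type="Jets"))
trainer.train()  # prints: training model_type=Jets
```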

config/jets.json

Lines changed: 120 additions & 0 deletions
```json
{
    "base_config": "config/tts.json",
    "model_type": "Jets",
    "task_type": "tts",
    "dataset": ["LJSpeech"],
    "preprocess": {
        // acoustic features
        "extract_audio": true,
        "extract_mel": true,
        "mel_extract_mode": "taco",
        "mel_min_max_norm": false,
        "extract_pitch": true,
        "extract_uv": false,
        "pitch_extractor": "dio",
        "extract_energy": true,
        "energy_extract_mode": "from_tacotron_stft",
        "extract_duration": true,
        "use_phone": false,
        "pitch_norm": true,
        "energy_norm": true,
        "pitch_remove_outlier": true,
        "energy_remove_outlier": true,

        // Default config
        "n_mel": 80,
        "win_size": 1024, // todo
        "hop_size": 256,
        "sample_rate": 22050,
        "n_fft": 1024, // todo
        "fmin": 0,
        "fmax": 8000, // todo
        "raw_data": "raw_data",
        "text_cleaners": ["english_cleaners"],
        "f0_min": 71, // ~C2
        "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_label": true,
        "is_mu_law": true,
        "bits": 8,

        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "content_vector_dir": "content_vector",
        "wenet_dir": "wenet",
        "mert_dir": "mert",
        "spk2id": "spk2id.json",
        "utt2spk": "utt2spk",
        "valid_file": "test.json",

        // Features used for model training
        "use_mel": true,
        "use_min_max_norm_mel": false,
        "use_frame_pitch": true,
        "use_frame_energy": true,
        "use_phone_pitch": false,
        "use_phone_energy": false,
        "use_log_scale_pitch": false,
        "use_log_scale_energy": false,
        "use_spkid": false,
        "align_mel_duration": true,
        "text_cleaners": ["english_cleaners"],
        "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
    },
    "model": {
        // Settings for transformer
        "transformer": {
            "encoder_layer": 4,
            "encoder_head": 2,
            "encoder_hidden": 256,
            "decoder_layer": 6,
            "decoder_head": 2,
            "decoder_hidden": 256,
            "conv_filter_size": 1024,
            "conv_kernel_size": [9, 1],
            "encoder_dropout": 0.2,
            "decoder_dropout": 0.2
        },

        // Settings for variance_predictor
        "variance_predictor": {
            "filter_size": 256,
            "kernel_size": 3,
            "dropout": 0.5
        },
        "variance_embedding": {
            "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
            "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
            "n_bins": 256
        },
        "max_seq_len": 1000
    },
    "train": {
        "batch_size": 16,
        "max_epoch": 100,
        "sort_sample": true,
        "drop_last": true,
        "group_size": 4,
        "grad_clip_thresh": 1.0,
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "lr_scheduler": {
            "num_warmup": 4000
        },
        // LR Scheduler
        "scheduler": "NoamLR",
        // Optimizer
        "optimizer": "Adam",
        "adam": {
            "lr": 0.0625,
            "betas": [0.9, 0.98],
            "eps": 0.000000001,
            "weight_decay": 0.0
        },
    }
}
```
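Note how `"scheduler": "NoamLR"` and `"num_warmup": 4000` pair with the seemingly large `"lr": 0.0625`: under a Noam schedule the configured rate acts as a peak-scale factor, not the actual step size. A sketch of the standard Noam formula, with values mirroring this config; whether Amphion's `NoamLR` folds the `d_model ** -0.5` term into the configured rate is an assumption of this sketch (`d_model` taken as the 256-dim encoder hidden size):

```python
def noam_lr(step: int, base_lr: float = 0.0625, warmup: int = 4000, d_model: int = 256) -> float:
    """Noam schedule: linear warmup for `warmup` steps, then inverse-sqrt decay."""
    step = max(step, 1)
    return base_lr * (d_model ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)

# The peak LR lands at the end of warmup: noam_lr(4000) ≈ 6.2e-5.
```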

egs/tts/Jets/README.md

Lines changed: 138 additions & 0 deletions
# Jets Recipe

In this recipe, we will show how to train [Jets](https://arxiv.org/abs/2203.16852) using Amphion's infrastructure. Jets is an end-to-end text-to-speech (E2E-TTS) model that jointly trains FastSpeech2 and HiFi-GAN with an alignment module.
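At a high level, the joint objective combines the HiFi-GAN generator losses with FastSpeech2's variance losses and an alignment-learning loss. Schematically (a paraphrase of the paper's objective, with the weighting factors λ left abstract):

$$
\mathcal{L}_{G} = \underbrace{\mathcal{L}_{adv} + \lambda_{fm}\,\mathcal{L}_{fm} + \lambda_{mel}\,\mathcal{L}_{mel}}_{\text{HiFi-GAN}} + \underbrace{\mathcal{L}_{dur} + \mathcal{L}_{pitch} + \mathcal{L}_{energy}}_{\text{FastSpeech2 variance}} + \underbrace{\mathcal{L}_{align}}_{\text{alignment module}}
$$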
There are four stages in total:

1. Data preparation
2. Feature extraction
3. Training
4. Inference

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
>
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

### Dataset Download

You can use LJSpeech to train the TTS model. How to download the dataset is detailed [here](../../datasets/README.md).

### Configuration

After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
"dataset": [
    "LJSpeech",
],
"dataset_path": {
    // TODO: Fill in your dataset path
    "LJSpeech": "[LJSpeech dataset path]",
},
```
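For reference, the committed `egs/tts/Jets/exp_config.json` (shown later in this commit) fills this in with the relative path `"../LJSpeech-1.1"`, i.e., the dataset folder sitting next to the Amphion checkout.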
## 2. Feature Extraction

### Configuration

Specify the `processed_dir` and the `log_dir` for saving the processed data and the checkpoints in `exp_config.json`:

```json
// TODO: Fill in the output log path
"log_dir": "ckpts/tts",
"preprocess": {
    // TODO: Fill in the output data path
    "processed_dir": "data",
    ...
},
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`):

```bash
sh egs/tts/Jets/run.sh --stage 1
```

## 3. Training

### Configuration

We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 16,
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `ckpts/tts/[YourExptName]`.

```bash
sh egs/tts/Jets/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. We recommend using only one GPU for training.

## 4. Inference

### Configuration

For inference, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --- | --- | --- |
| `--infer_expt_dir` | The experiment directory which contains the `checkpoint` | `ckpts/tts/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audio. | `ckpts/tts/[YourExptName]/result` |
| `--infer_mode` | The inference mode, e.g., `batch`. | Use `batch` to generate a batch of speech at a time. |
| `--infer_dataset` | The dataset used for inference. | For the LJSpeech dataset, the inference dataset would be `LJSpeech`. |
| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., `test`. | For the LJSpeech dataset, the testing set would be the `test` split created during feature extraction. |

### Run

For example, if you want to generate speech for the whole testing set split from LJSpeech, just run:

```bash
sh egs/tts/Jets/run.sh --stage 3 \
    --infer_expt_dir ckpts/tts/[YourExptName] \
    --infer_output_dir ckpts/tts/[YourExptName]/result \
    --infer_mode "batch" \
    --infer_dataset "LJSpeech" \
    --infer_testing_set "test"
```
### Issues and Solutions

```
NotImplementedError: Using RTX 3090 or 4000 series doesn't support faster communication broadband via P2P or IB. Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which will do this automatically.
2024-02-24 10:57:49 | INFO | torch.distributed.distributed_c10d | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.
```

This error reports that NVIDIA RTX 3090 and 4000 series GPUs do not support peer-to-peer (P2P) communication or InfiniBand (IB), which are used for faster inter-GPU communication. It is raised by the `accelerate` library, which facilitates distributed training and inference.

To fix this issue, set the environment variables in your terminal before running your script:

```bash
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
```
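Alternatively, the same variables can be set from Python (a minimal sketch; NCCL reads them at initialization time, so this must run before any `torch.distributed` or `accelerate` setup code):

```python
import os

# Must be set before torch.distributed / accelerate initializes NCCL.
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
```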
### Note

Extensive logging messages related to `torch._subclasses.fake_tensor` and `torch._dynamo.output_graph` may be observed during inference. Attempts to suppress these logs have not been successful, but they do not affect the inference results.

If you use Jets in your work, please cite the original paper:

```bibtex
@article{lim2022jets,
  title={JETS: Jointly training FastSpeech2 and HiFi-GAN for end to end text to speech},
  author={Lim, Dan and Jung, Sunghee and Kim, Eesung},
  journal={arXiv preprint arXiv:2203.16852},
  year={2022}
}
```

egs/tts/Jets/exp_config.json

Lines changed: 32 additions & 0 deletions
```json
{
    "base_config": "config/jets.json",
    "model_type": "Jets",
    "dataset": [
        "LJSpeech"
    ],
    "dataset_path": {
        "LJSpeech": "../LJSpeech-1.1"
    },
    "log_dir": "ckpts/tts",
    "preprocess": {
        "processed_dir": "data",
        "sample_rate": 22050,
        "use_audios": true,
        "extract_audio": true,
    },
    "train": {
        "batch_size": 16,
        "max_epoch": 100,
        "learning_rate": 2e-4,
        "AdamW": {
            "betas": [
                0.8,
                0.99
            ],
            "eps": 1e-9,
        },
        "lr_decay": 0.999875,
        "segment_size": 64,
        "upsample_factor": 256
    }
}
```
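This file only overrides a handful of keys; everything else is inherited through the `base_config` chain (`exp_config.json` → `config/jets.json` → `config/tts.json`). Amphion loads configs via `utils.util.load_config`; a minimal sketch of how such recursive merging could work, under the assumption that child keys override base keys (Amphion's actual merge semantics are not shown here, and `json5` is used because the configs contain `//` comments):

```python
import json5  # pip install json5 -- strict json rejects the // comments above

def load_config(path: str) -> dict:
    """Resolve a 'base_config' chain; paths are relative to the Amphion root."""
    with open(path) as f:
        cfg = json5.load(f)
    base_path = cfg.pop("base_config", None)
    if base_path is None:
        return cfg
    merged = load_config(base_path)
    _deep_update(merged, cfg)
    return merged

def _deep_update(dst: dict, src: dict) -> None:
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dst.get(key), dict):
            _deep_update(dst[key], value)  # merge nested sections like "train"
        else:
            dst[key] = value  # child value wins outright

cfg = load_config("egs/tts/Jets/exp_config.json")
print(cfg["model_type"])  # "Jets", with defaults filled in from the base configs
```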

egs/tts/Jets/prepare_mfa.sh

Lines changed: 29 additions & 0 deletions
```bash
#!/bin/bash

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Navigate to the 'pretrained' directory
cd pretrained || { echo "Failed to change directory to 'pretrained'"; exit 1; }

# Create and navigate to the 'mfa' directory
mkdir -p mfa && cd mfa || { echo "Failed to create or change directory to 'mfa'"; exit 1; }

# Define the MFA file URL and the file name
mfa_url="https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.1.0-beta.2/montreal-forced-aligner_linux.tar.gz"
mfa_file="montreal-forced-aligner_linux.tar.gz"

# Download MFA if it doesn't exist
if [ ! -f "$mfa_file" ]; then
    wget "$mfa_url" || { echo "Failed to download MFA"; exit 1; }
fi

# Extract MFA
tar -zxvf "$mfa_file" || { echo "Failed to extract MFA"; exit 1; }

# Optionally, remove the tar.gz file after extraction
rm "$mfa_file"

echo "MFA setup completed successfully."
```
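Since the config sets `"extract_duration": true`, the recipe presumably uses Montreal Forced Aligner for phone durations; the script would then be run once from the Amphion root (so that the relative `pretrained/` path resolves) as `sh egs/tts/Jets/prepare_mfa.sh` before the feature-extraction stage.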
