open-mmlab · Nugine · May 7, 2024 · May 7, 2024 · May 7, 2024 · May 8, 2024
diff --git a/config/freevc.json b/config/freevc.json
@@ -0,0 +1,96 @@
+{
+    "preprocess": {
+        "vctk_dir": "./data/VCTK",
+        "vctk_16k_dir": "./data/vctk-16k",
+        "vctk_22k_dir": "./data/vctk-22k",
+        "split_dir": "./data/split",
+        "spk_dir": "./data/spk",
+        "ssl_dir": "./data/ssl",
+        "sr_dir": "./data/sr",
+        "hifigan_ckpt_path": "./ckpts/hifigan-vctk-v1",
+        "minh": 68,
+        "maxh": 92
+    },
+    "data": {
+        "sampling_rate": 16000,
+        "filter_length": 1280,
+        "hop_length": 320,
+        "win_length": 1280,
+        "n_mel_channels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "train": {
+        "log_interval": 200,
+        "eval_interval": 10000,
+        "seed": 1234,
+        "epochs": 10000,
+        "learning_rate": 2e-4,
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-9,
+        "batch_size": 64,
+        "num_workers": 16,
+        "fp16_run": false,
+        "lr_decay": 0.999875,
+        "segment_size": 8960,
+        "init_lr_ratio": 1,
+        "warmup_epochs": 0,
+        "c_mel": 45,
+        "c_kl": 1.0,
+        "use_sr": true,
+        "max_speclen": 128
+    },
+    "model": {
+        "inter_channels": 192,
+        "hidden_channels": 192,
+        "filter_channels": 768,
+        "n_heads": 2,
+        "n_layers": 6,
+        "kernel_size": 3,
+        "p_dropout": 0.1,
+        "resblock": "1",
+        "resblock_kernel_sizes": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates": [
+            10,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel": 512,
+        "upsample_kernel_sizes": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "n_layers_q": 3,
+        "use_spectral_norm": false,
+        "gin_channels": 256,
+        "ssl_dim": 1024,
+        "use_spk": true
+    }
+}
diff --git a/egs/vc/FreeVC/README.md b/egs/vc/FreeVC/README.md
@@ -0,0 +1,145 @@
+# FreeVC
+
+This is an implementation of [FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion](https://arxiv.org/abs/2210.15418). It adopts the end-to-end framework of [VITS](https://arxiv.org/abs/2106.06103) for high-quality waveform reconstruction and proposes strategies for extracting clean content information without text annotation. It disentangles content information by imposing an information bottleneck on [WavLM](https://arxiv.org/abs/2110.13900) features, and utilizes **spectrogram-resize** based data augmentation to improve the purity of the extracted content information.
+
+There are four stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Training
+4. Inference/conversion
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+>
+> ```bash
+> cd Amphion
+> ```
+
+## 1. Data Preparation
+
+### Dataset Download
+
+For other experiments, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
+
+In this experiment, we only utilize two datasets: VCTK and LibriTTS.
+
+### Configuration
+
+Specify the dataset path in `exp_config.json`.
+
+```json
+    "preprocess": {
+        "vctk_dir": "[VCTK dataset path]",
+        // ...
+    }
+```
+
+## 2. Features Extraction
+
+### Pretrained Models Download
+
+You should download pretrained HiFi-GAN (VCTK_V1) from [its repo](https://github.com/jik876/hifi-gan) according to the original paper.
+
+The code will automatically download pretrained [WavLM-Large](https://huggingface.co/microsoft/wavlm-large) model from Huggingface. You can also download it in advance:
+
+```bash
+huggingface-cli download microsoft/wavlm-large
+```
+
+The pretrained speaker encoder is available at: <https://github.com/liusongxiang/ppg-vc/tree/main/speaker_encoder/ckpt>
+
+The weight should be put in `models/vc/FreeVC/speaker_encoder/ckpt/` since it is excluded from the git history.
+
+### Configuration
+
+Specify the data path and the checkpoint path for saving the processed data in `exp_config.json`:
+
+```json
+    "preprocess": {
+        // ...
+        "vctk_16k_dir": "[preprocessed VCTK 16k directory]",
+        "vctk_22k_dir": "[preprocessed VCTK 22k directory]",
+        "spk_dir": "[preprocess_spk directory]",
+        "ssl_dir": "[preprocess_ssl directory]",
+        "sr_dir": "[preprocess_sr directory]", 
+        "hifigan_ckpt_path": "[hifigan checkpoint file path]"
+        // ...
+    },
+```
+
+Note that the preprocessed data will take about 600GB disk space.
+
+### Run
+
+Run `run.sh` as the preprocessing stage (set `--stage 1`).
+
+```bash
+sh egs/vc/FreeVC/run.sh --stage 1 -c egs/vc/FreeVC/exp_config.json
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.
+
+## 3. Training
+
+### Configuration
+
+We provide the default hyperparameters in `config/freevc.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
+
+```json
+"model": {
+    "use_spk": true
+    // ...
+},
+"train": {
+    "use_sr": true,
+    // ...
+    "batch_size": 32,
+    // ...
+    "learning_rate": 2.0e-4
+    // ...
+}
+```
+
+### Run
+
+Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/vc/FreeVC/[YourExptName]`.
+
+```bash
+sh egs/vc/FreeVC/run.sh --stage 2 -c egs/vc/FreeVC/exp_config.json --name [YourExptName]
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`.
+
+## 4. Inference/Conversion
+
+### Run
+
+For inference/conversion, you first need to create a file `convert.txt` indicating the source audio, target audio and the name of the output audio in the following format:
+
+```
+# format
+[name of the output]|[path/to/the/source/audio]|[path/to/the/target/audio]
+
+# an example(each reconstruction written in a line)
+title1|data/vctk-16k/p225/p225_001.wav|data/vctk-16k/p226/p226_002.wav
+```
+
+
+Then run `run.sh` with the following configurations specified:
+
+| Parameters        | Description                                                  | Example                                                      |
+| ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| `--config`        | The base configuration                                       | `[Your path to the base configuration]`                      |
+| `--ckpt`          | The experimental directory which contains `checkpoint`       | `[Your path to save logs and checkpoints]/[YourExptName]`    |
+| `--convert`       | The convert.txt path which contains the audios to be reconstructed | `[Your path to save convert.txt]`                            |
+| `--outdir`        | The output directory to save inferred audios.                | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
+
+For example:
+
+```bash
+sh egs/vc/FreeVC/run.sh --stage 3 \
+	--config egs/vc/FreeVC/exp_config.json \
+	--ckpt ckpts/vc/FreeVC/[YourExptName]/G_100000.ckpt \
+	--convert ckpts/vc/FreeVC/[YourExptName] \
+	--outdir ckpts/vc/FreeVC/[YourExptName]/result
+```
diff --git a/egs/vc/FreeVC/exp_config.json b/egs/vc/FreeVC/exp_config.json
@@ -0,0 +1,18 @@
+{
+    "base_config": "config/freevc.json",
+    "preprocess": {
+        "vctk_dir": "[VCTK dataset path]",
+        "vctk_16k_dir": "[preprocessed VCTK 16k directory]",
+        "vctk_22k_dir": "[preprocessed VCTK 22k directory]",
+        "spk_dir": "[preprocess_spk directory]",
+        "ssl_dir": "[preprocess_ssl directory]",
+        "sr_dir": "[preprocess_sr directory]",
+        "hifigan_ckpt_path": "[hifigan checkpoint file path]"
+    },
+    "model": {
+        "use_spk": true
+    },
+    "train": {
+        "use_sr": true
+    }
+}
diff --git a/egs/vc/FreeVC/run.sh b/egs/vc/FreeVC/run.sh
@@ -0,0 +1,129 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# FreeVC recipe entry point, selected with --stage:
+#   1: feature extraction (models/vc/FreeVC/preprocess.py)
+#   2: training           (models/vc/FreeVC/train.py)
+#   3: inference          (models/vc/FreeVC/inference.py)
+
+######## Build Experiment Environment ###########
+# exp_dir is the directory holding this script; work_dir is three levels above it.
+exp_dir=$(cd "$(dirname "$0")" && pwd)
+work_dir=$(dirname "$(dirname "$(dirname "$exp_dir")")")
+
+export WORK_DIR=$work_dir
+export PYTHONPATH=$work_dir
+export PYTHONIOENCODING=UTF-8
+
+######## Parse the Given Parameters from the Command ###########
+# Every option takes a value, so each name carries a trailing ':' (including -s).
+# Stop right away when getopt rejects the command line.
+options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,ckpt:,convert:,outdir:,use_timestamp: -- "$@") || exit 1
+eval set -- "$options"
+
+while true; do
+  case $1 in
+    # Experimental Configuration File
+    -c | --config) shift; exp_config=$1 ; shift ;;
+    # Experimental Name
+    -n | --name) shift; exp_name=$1 ; shift ;;
+    # Running Stage
+    -s | --stage) shift; running_stage=$1 ; shift ;;
+    # Visible GPU machines. The default value is "0".
+    --gpu) shift; gpu=$1 ; shift ;;
+
+    # [Only for Training] Resume configuration
+    --resume) shift; resume=$1 ; shift ;;
+    # [Only for Training] The specific checkpoint path that you want to resume from.
+    --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
+    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
+    --resume_type) shift; resume_type=$1 ; shift ;;
+
+    # [Only for Inference] The path of saved checkpoint.
+    --ckpt) shift; ckpt=$1 ; shift ;;
+    # [Only for Inference] The path of convert file
+    --convert) shift; convert=$1 ; shift ;;
+    # [Only for Inference] The output dir to save inferred audios.
+    --outdir) shift; outdir=$1 ; shift ;;
+    # [Only for Inference] Whether to use timestamp in the output filename.
+    # Parsed only; this script does not forward it to inference.py.
+    --use_timestamp) shift; use_timestamp=$1 ; shift ;;
+    --) shift ; break ;;
+    # The ';' before exit matters: without it "exit 1" is just echoed text.
+    *) echo "Invalid option: $1"; exit 1 ;;
+  esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+    echo "[Error] Please specify the running stage"
+    exit 1
+fi
+
+if [ -z "$exp_config" ]; then
+    exp_config="${exp_dir}"/exp_config.json
+fi
+echo "Exprimental Configuration File: $exp_config"
+
+if [ -z "$gpu" ]; then
+    gpu="0"
+fi
+
+######## Features Extraction ###########
+if [ "$running_stage" -eq 1 ]; then
+    CUDA_VISIBLE_DEVICES="$gpu" python "${work_dir}"/models/vc/FreeVC/preprocess.py \
+        --config "$exp_config"
+fi
+
+######## Training ###########
+if [ "$running_stage" -eq 2 ]; then
+    if [ -z "$exp_name" ]; then
+        echo "[Error] Please specify the experiments name"
+        exit 1
+    fi
+    echo "Exprimental Name: $exp_name"
+
+    # add default value
+    if [ -z "$resume_from_ckpt_path" ]; then
+        resume_from_ckpt_path=""
+    fi
+
+    if [ -z "$resume_type" ]; then
+        resume_type="resume"
+    fi
+
+    if [ "$resume" = true ]; then
+        echo "Resume from the existing experiment..."
+        CUDA_VISIBLE_DEVICES="$gpu" python "${work_dir}"/models/vc/FreeVC/train.py \
+            --config "$exp_config" \
+            --exp_name "$exp_name" \
+            --log_level info \
+            --resume \
+            --resume_from_ckpt_path "$resume_from_ckpt_path" \
+            --resume_type "$resume_type"
+    else
+        echo "Start a new experiment..."
+        CUDA_VISIBLE_DEVICES="$gpu" python "${work_dir}"/models/vc/FreeVC/train.py \
+            --config "$exp_config" \
+            --exp_name "$exp_name" \
+            --log_level info
+    fi
+fi
+
+######## Inference/Conversion ###########
+if [ "$running_stage" -eq 3 ]; then
+    # Fail early with a clear message when a required inference argument is missing.
+    if [ -z "$ckpt" ] || [ -z "$convert" ] || [ -z "$outdir" ]; then
+        echo "[Error] Please specify --ckpt, --convert and --outdir for inference"
+        exit 1
+    fi
+
+    CUDA_VISIBLE_DEVICES="$gpu" python "$work_dir"/models/vc/FreeVC/inference.py \
+        --config "$exp_config" \
+        --ckpt "$ckpt" \
+        --convert "$convert" \
+        --outdir "$outdir"
+fi