From 30a6b8911dd8b2f7bb467799eaa140ed8f88386b Mon Sep 17 00:00:00 2001 From: MingHui-Fang <1546415953@qq.com> Date: Fri, 14 Jun 2024 22:21:43 +0800 Subject: [PATCH 1/2] Fix some errors --- helpers/model_init_scripts/init_dummy_model.py | 6 +++++- helpers/model_init_scripts/init_dummy_model_with_encodec.py | 6 +++++- training/run_parler_tts_training.py | 5 +++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/helpers/model_init_scripts/init_dummy_model.py b/helpers/model_init_scripts/init_dummy_model.py index 25f18e8..0919073 100644 --- a/helpers/model_init_scripts/init_dummy_model.py +++ b/helpers/model_init_scripts/init_dummy_model.py @@ -57,7 +57,11 @@ model.generation_config.eos_token_id = encodec_vocab_size # set other default generation config params - model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate) + ''' + In DAC, the 'model.audio_encoder.config.frame_rate' is 86 + 'model.generation_config.max_length' should be smaller than 'max_position_embeddings' in the decoder. + ''' + model.generation_config.max_length = int(20 * model.audio_encoder.config.frame_rate) model.generation_config.do_sample = True # True model.generation_config.guidance_scale = 1 # 3.0 diff --git a/helpers/model_init_scripts/init_dummy_model_with_encodec.py b/helpers/model_init_scripts/init_dummy_model_with_encodec.py index 32242b4..0cf0baa 100644 --- a/helpers/model_init_scripts/init_dummy_model_with_encodec.py +++ b/helpers/model_init_scripts/init_dummy_model_with_encodec.py @@ -54,7 +54,11 @@ model.generation_config.eos_token_id = encodec_vocab_size # set other default generation config params - model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate) + ''' + In encodec, the 'model.audio_encoder.config.frame_rate' is 75 + 'model.generation_config.max_length' should be smaller than 'max_position_embeddings' in the decoder. + ''' + model.generation_config.max_length = int(25 * model.audio_encoder.config.frame_rate) model.generation_config.do_sample = True # True model.generation_config.guidance_scale = 1 # 3.0 diff --git a/training/run_parler_tts_training.py b/training/run_parler_tts_training.py index 22e091f..89bf577 100644 --- a/training/run_parler_tts_training.py +++ b/training/run_parler_tts_training.py @@ -43,7 +43,7 @@ from accelerate import Accelerator -from accelerate.utils import set_seed, AutocastKwargs, InitProcessGroupKwargs, TorchDynamoPlugin +from accelerate.utils import set_seed, AutocastKwargs, InitProcessGroupKwargs, TorchDynamoPlugin, DistributedDataParallelKwargs from accelerate.utils.memory import release_memory from parler_tts import ( @@ -97,7 +97,8 @@ def main(): padding = "max_length" if data_args.pad_to_max_length else "longest" ####### A. Preparation - kwargs_handlers = [InitProcessGroupKwargs(timeout=timedelta(minutes=60))] + kwargs_handlers = [InitProcessGroupKwargs(timeout=timedelta(minutes=60)), + DistributedDataParallelKwargs(find_unused_parameters=True)] accelerator = Accelerator( gradient_accumulation_steps=training_args.gradient_accumulation_steps, From d29b2c1b39895dda04c087b79385d80776a2bc9b Mon Sep 17 00:00:00 2001 From: MingHui-Fang <1546415953@qq.com> Date: Tue, 18 Jun 2024 00:50:33 +0800 Subject: [PATCH 2/2] Fix bugs --- training/run_parler_tts_training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/run_parler_tts_training.py b/training/run_parler_tts_training.py index 89bf577..726dbf8 100644 --- a/training/run_parler_tts_training.py +++ b/training/run_parler_tts_training.py @@ -433,8 +433,8 @@ def apply_audio_decoder(batch): if accelerator.is_main_process: lab = generate_labels["labels"].cpu().transpose(1, 2).to(torch.int16) - rat = generate_labels["ratio"].cpu().squeeze() - lens = generate_labels["len_audio"].cpu().squeeze() + rat = generate_labels["ratio"].cpu().reshape(-1) + lens = generate_labels["len_audio"].cpu().reshape(-1) lab = [l[:, : int(ratio * length)] for (l, ratio, length) in zip(lab, rat, lens)] all_generated_labels.extend(lab)