Question about zero-shot TTS #165
import argparse
import json
import os
import sys
from collections import OrderedDict
from datetime import datetime

import numpy as np
import torch
from scipy.io.wavfile import write

from uberduck_ml_dev.models.radtts import RADTTS
# Data and get_vocoder are imported from their uberduck_ml_dev modules (paths omitted).


def warmstart(checkpoint_path, model, strict=False):
    pretrained_dict = torch.load(checkpoint_path, map_location="cpu")
    pretrained_dict = pretrained_dict["state_dict"]
    is_module = False
    if list(pretrained_dict.keys())[0].startswith("module."):
        is_module = True
    if is_module:
        new_state_dict = OrderedDict()
        for k, v in pretrained_dict.items():
            name = k[7:]  # remove `module.`
            new_state_dict[name] = v
        pretrained_dict = new_state_dict
    model_dict = model.state_dict()
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict, strict=strict)
    print(f"Warm started from {checkpoint_path} (is_module={is_module})")
    model.eval()
    return model


def parse_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="Path to JSON config")
    parser.add_argument("-o", "--output_dir", default="results")
    parser.add_argument("--text", default="hello world")
    return parser.parse_args(args)


if __name__ == "__main__":
    args = parse_args(sys.argv[1:])
    if args.config:
        with open(args.config) as f:
            config = json.load(f)
    else:
        print("No config provided; pass --config path/to/config.json")
        sys.exit(1)

    model_config = config["model_config"]
    model = RADTTS(**model_config)
    pred_config = config["pred_config"]
    model = warmstart(pred_config["warmstart_checkpoint_path"], model)

    # vocoder
    vocoder = get_vocoder(
        hifi_gan_config_path=pred_config["vocoder_config_path"],
        hifi_gan_checkpoint_path=pred_config["vocoder_checkpoint_path"],
    )

    # Reuse the training Data object for its text preprocessing.
    ignore_keys = ["training_files", "validation_files"]
    print("initializing training dataloader")
    data_config = config["data_config"]
    dataset = Data(
        data_config["training_files"],
        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
    )

    text = dataset.get_text(args.text).unsqueeze(0)
    print(f"type(text)={type(text)} text.shape={text.shape}")
    speaker_id = torch.LongTensor([0])
    model_output = model.infer(speaker_id, text, sigma=0.8)
    mels = model_output["mel"]

    if hasattr(vocoder, "forward"):
        audio = vocoder(mels.cpu()).float()[0]
        audio = audio[0].detach().cpu().numpy()
        audio = audio / np.abs(audio).max()
        os.makedirs(args.output_dir, exist_ok=True)
        now = datetime.now()
        suffix_path = now.strftime("%H_%M_%S")
        # write() needs a sample rate and the audio data; 22050 assumed to match the vocoder config.
        write("{}/{}.wav".format(args.output_dir, suffix_path), 22050, audio)
However, results will probably be better if you fine-tune. You can fine-tune the zero-shot model by setting is_zero_shot = True, or fine-tune a standard multispeaker model trained from scratch on LJ; fine-tuning the zero-shot model is probably the better option. You can also train a two-speaker model from scratch on LJ plus data from the specific voice you are creating.
If you load the model with "include_modules": "decatndpmvpredapm" you should be able to perform inference, but you'll probably need to train first (unless you load a pretrained set of attribute predictors).
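For concreteness, a rough sketch of where those two settings might sit in the JSON config that the inference script above loads (the key names and nesting here are illustrative assumptions, not taken from the repo):

import json

# Hypothetical config fragment; only the two settings discussed above are shown,
# and their exact placement in the real config may differ.
config = {
    "model_config": {
        "include_modules": "decatndpmvpredapm",  # load decoder plus attribute predictors
        # ... remaining RADTTS(**model_config) arguments ...
    },
    "train_config": {
        "is_zero_shot": True,  # assumed location of the zero-shot fine-tuning flag
    },
}
print(json.dumps(config, indent=2))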
# in models/radtts.py
# class RADTTS
def infer(...):
    ...
    if dur is None:
        # TODO (Sam): replace non-controllable is_available with controllable global setting. This is useful for debugging.
        if torch.cuda.is_available():
            z_dur = torch.cuda.FloatTensor(batch_size, 1, n_tokens)
        else:
            z_dur = torch.FloatTensor(batch_size, 1, n_tokens)
        z_dur = z_dur.normal_() * sigma_dur
        dur = self.dur_pred_layer.infer(
            z_dur, txt_enc, spk_vec_text, lens=text_lengths
        )
        if dur.shape[-1] < txt_enc.shape[-1]:
            to_pad = txt_enc.shape[-1] - dur.shape[2]
            pad_fn = nn.ReplicationPad1d((0, to_pad))
            dur = pad_fn(dur)
        dur = dur[:, 0]
        # dur = tensor([[[-0.2046, -0.1092, 0.0814, -0.1135, 0.2197, 0.0398, 0.4338, 0.3603, 0.3822, 0.2547, -0.0898, -0.0038]]], device='cuda:0')
        dur = dur.clamp(0, token_duration_max)
        # dur = tensor([[0.0000, 0.0000, 0.0814, 0.0000, 0.2197, 0.0398, 0.4338, 0.3603, 0.3822, 0.2547, 0.0000, 0.0000]], device='cuda:0', grad_fn=<ClampBackward1>)
        dur = dur * token_dur_scaling if token_dur_scaling > 0 else dur
        # dur unchanged, since token_dur_scaling = 1
        dur = (dur + 0.5).floor().int()
        # dur = tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0', dtype=torch.int32)

    out_lens = dur.sum(1).long().cpu() if dur.shape[0] != 1 else [dur.sum(1)]  # out_lens = [tensor([0], device='cuda:0')]
    max_n_frames = max(out_lens)  # max_n_frames = tensor([0], device='cuda:0')
    out_lens = torch.LongTensor(out_lens).to(txt_enc.device)  # tensor([0], device='cuda:0')

    txt_enc_time_expanded = self.length_regulator(
        txt_enc.transpose(1, 2), dur
    ).transpose(1, 2)
    # txt_enc_time_expanded.shape = torch.Size([1, 512, 0])
    # txt_enc.shape = torch.Size([1, 512, 12])
    # dur = tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0', dtype=torch.int32)
    ...

Traceback (most recent call last):
File "./uberduck-ml-dev/uberduck_ml_dev/exec/inference.py", line 159, in <module>
model_output = model.infer(speaker_id, text, sigma=0.8)
File "./uberduck-ml-dev/uberduck_ml_dev/models/radtts.py", line 778, in infer
voiced_mask = self.v_pred_module.infer(
File "./uberduck-ml-dev/uberduck_ml_dev/models/components/attribute_prediction_model.py", line 137, in infer
x_hat = self.forward(txt_enc, spk_emb, x=None, lens=lens)["x_hat"]
File "./uberduck-ml-dev/uberduck_ml_dev/models/components/attribute_prediction_model.py", line 127, in forward
txt_enc = self.bottleneck_layer(txt_enc)
File ".conda/envs/test-env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "./uberduck-ml-dev/uberduck_ml_dev/models/components/attribute_prediction_model.py", line 99, in forward
x = self.projection_fn(x)
File ".conda/envs/test-env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "./uberduck-ml-dev/uberduck_ml_dev/models/common.py", line 1521, in forward
conv_signal = self.conv(signal)
File ".conda/envs/test-env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1208, in _call_impl
result = forward_call(*input, **kwargs)
File ".conda/envs/test-env/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 313, in forward
return self._conv_forward(input, self.weight, self.bias)
File ".conda/envs/test-env/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 309, in _conv_forward
return F.conv1d(input, weight, bias, self.stride,
RuntimeError: Calculated padded input size per channel: (2). Kernel size: (3). Kernel size can't be greater than actual input size
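If I am reading the debug output correctly, the failure comes from the durations: every predicted value is below 0.5, so dur = (dur + 0.5).floor().int() rounds all of them to zero, the length regulator then produces a zero-frame txt_enc_time_expanded, and the voiced-mask predictor's Conv1d (kernel size 3) gets an input shorter than its kernel, which is exactly the RuntimeError above. A minimal standalone reproduction of both steps in plain PyTorch (not the repository code; the 512 channels and padding=1 are assumptions chosen to match the shapes printed above):

import torch
import torch.nn as nn

# 1) Duration rounding, using the values printed above: every entry is < 0.5,
#    so after clamping the negatives to zero, every token rounds down to 0 frames.
dur = torch.tensor([[-0.2046, -0.1092, 0.0814, -0.1135, 0.2197, 0.0398,
                     0.4338, 0.3603, 0.3822, 0.2547, -0.0898, -0.0038]])
dur = dur.clamp(min=0)            # clamp(0, token_duration_max) in the real code
dur = (dur + 0.5).floor().int()   # tensor([[0, 0, ..., 0]])
print(dur.sum())                  # tensor(0) -> zero mel frames requested

# 2) A Conv1d with kernel_size=3 (padding=1 assumed) applied to a zero-frame input:
#    the padded length is 2, smaller than the kernel, which raises the same
#    "Kernel size can't be greater than actual input size" error as in the traceback.
conv = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
empty = torch.randn(1, 512, 0)    # mirrors txt_enc_time_expanded with 0 frames
conv(empty)                       # RuntimeError: Calculated padded input size per channel: (2). Kernel size: (3). ...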
Thank you for your open-source work, but I can't seem to find a complete implementation of zero-shot TTS.