diff --git a/src/infer_pack/models.py b/src/infer_pack/models.py index ac9e7078..c955bba0 100644 --- a/src/infer_pack/models.py +++ b/src/infer_pack/models.py @@ -14,9 +14,10 @@ from src.infer_pack.commons import init_weights, get_padding, sequence_mask, rand_slice_segments, slice_segments2 -class TextEncoder256(nn.Module): +class TextEncoder(nn.Module): def __init__( self, + input_dim out_channels, hidden_channels, filter_channels, @@ -34,7 +35,7 @@ def __init__( self.n_layers = n_layers self.kernel_size = kernel_size self.p_dropout = p_dropout - self.emb_phone = nn.Linear(256, hidden_channels) + self.emb_phone = nn.Linear(input_dim, hidden_channels) self.lrelu = nn.LeakyReLU(0.1, inplace=True) if f0 == True: self.emb_pitch = nn.Embedding(256, hidden_channels) @@ -57,49 +58,6 @@ def forward(self, phone, pitch, lengths): return m, logs, x_mask -class TextEncoder768(nn.Module): - def __init__( - self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=True, - ): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.emb_phone = nn.Linear(768, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) - self.encoder = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, phone, pitch, lengths): - if pitch is None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) - x = self.lrelu(x) - x = torch.transpose(x, 1, -1) - x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype) - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return m, logs, x_mask - - class ResidualCouplingBlock(nn.Module): def __init__( self, @@ -408,88 +366,10 @@ def remove_weight_norm(self): } -class SynthesizerTrnMs256NSFsid(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr, - **kwargs - ): - super().__init__() - if type(sr) == type("strr"): - sr = sr2sr[sr] - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder256(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout) - self.dec = GeneratorNSF(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels, sr=sr, is_half=kwargs["is_half"]) - self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward(self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds): - g = self.emb_g(ds).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) - pitchf = slice_segments2(pitchf, ids_slice, self.segment_size) - o = self.dec(z_slice, pitchf, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate: - head = int(z_p.shape[2] * rate) - z_p = z_p[:, :, -head:] - x_mask = x_mask[:, :, -head:] - nsff0 = nsff0[:, -head:] - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, nsff0, g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class SynthesizerTrnMs768NSFsid(nn.Module): +class Synthesizer(nn.Module): def __init__( self, + input_dim, spec_channels, segment_size, inter_channels, @@ -530,7 +410,7 @@ def __init__( self.segment_size = segment_size self.gin_channels = gin_channels self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder768(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout) + self.enc_p = TextEncoder(input_dim, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout) self.dec = GeneratorNSF(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels, sr=sr, is_half=kwargs["is_half"]) self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels) @@ -566,84 +446,10 @@ def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): return o, x_mask, (z, z_p, m_p, logs_p) -class SynthesizerTrnMs256NSFsid_nono(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr=None, - **kwargs - ): - super().__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder256(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=False) - self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) - self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward(self, phone, phone_lengths, y, y_lengths, ds): - g = self.emb_g(ds).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) - o = self.dec(z_slice, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, phone, phone_lengths, sid, rate=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate: - head = int(z_p.shape[2] * rate) - z_p = z_p[:, :, -head:] - x_mask = x_mask[:, :, -head:] - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class SynthesizerTrnMs768NSFsid_nono(nn.Module): +class Synthesizer_nono(nn.Module): def __init__( self, + input_dim, spec_channels, segment_size, inter_channels, @@ -682,7 +488,7 @@ def __init__( self.segment_size = segment_size self.gin_channels = gin_channels self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder768(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=False) + self.enc_p = TextEncoder(input_dim, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=False) self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels)