forked from cnlinxi/book-text-to-speech
-
Notifications
You must be signed in to change notification settings - Fork 3
/
text_to_speech.toc
305 lines (305 loc) · 17.7 KB
/
text_to_speech.toc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {1}语音合成概述}{1}{chapter.1}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.1}背景和概述}{1}{section.1.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.1.1}背景介绍}{1}{subsection.1.1.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.1.2}语音合成概述}{1}{subsection.1.1.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.1.3}语音合成的历史}{1}{subsection.1.1.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.1.4}当代语音合成框架}{3}{subsection.1.1.4}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.2}Awesome List}{4}{section.1.2}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.3}参考书籍}{4}{section.1.3}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.4}语音相关的会议、期刊、比赛和公司}{4}{section.1.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.1}会议}{4}{subsection.1.4.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.2}期刊}{5}{subsection.1.4.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.3}最新论文}{5}{subsection.1.4.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.4}比赛}{5}{subsection.1.4.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.5}公司}{5}{subsection.1.4.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.6}微信公众号}{5}{subsection.1.4.6}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.5}开源资料}{6}{section.1.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.1}中文数据集}{6}{subsection.1.5.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.2}英文数据集}{6}{subsection.1.5.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.3}情感数据集}{7}{subsection.1.5.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.4}其它数据集}{7}{subsection.1.5.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.5}开源工具}{7}{subsection.1.5.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.6}开源项目}{7}{subsection.1.5.6}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.6}语音合成评价指标}{8}{section.1.6}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.7}平均意见得分的测评要求与方法}{9}{section.1.7}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.1}实验要求}{9}{subsection.1.7.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.2}实验方法}{9}{subsection.1.7.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.3}实验步骤}{9}{subsection.1.7.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.4}实验设计}{9}{subsection.1.7.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.5}实验数据处理}{10}{subsection.1.7.5}%
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {2}语音信号基础}{11}{chapter.2}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.1}参考资料}{11}{section.2.1}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.2}语音基本概念}{11}{section.2.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.1}能量}{11}{subsection.2.2.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.2}短时能量}{11}{subsection.2.2.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.3}声强和声强级}{11}{subsection.2.2.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.4}响度}{11}{subsection.2.2.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.5}过零率}{12}{subsection.2.2.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.6}共振峰}{12}{subsection.2.2.6}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.7}基频和基音周期}{12}{subsection.2.2.7}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.8}音高}{13}{subsection.2.2.8}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.9}MFCC和语谱图}{14}{subsection.2.2.9}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.3}语言学}{14}{section.2.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.3.1}国际音标简介}{15}{subsection.2.3.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.3.2}IPA的字母和发音}{15}{subsection.2.3.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.3.3}音系学}{17}{subsection.2.3.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.3.4}参考资料}{17}{subsection.2.3.4}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.4}音频格式}{18}{section.2.4}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.5}数字信号处理}{18}{section.2.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.1}模数转换}{18}{subsection.2.5.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.2}频谱泄露}{18}{subsection.2.5.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.3}频率分辨率}{18}{subsection.2.5.3}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.6}其它概念}{18}{section.2.6}%
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {3}语音特征提取}{20}{chapter.3}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.1}预处理}{20}{section.3.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.1.1}预加重}{20}{subsection.3.1.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.1.2}分帧}{20}{subsection.3.1.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.1.3}加窗}{20}{subsection.3.1.3}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.2}短时傅里叶变换}{21}{section.3.2}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.3}听觉特性}{22}{section.3.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.3.1}梅尔滤波}{22}{subsection.3.3.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.3.2}Bark滤波}{23}{subsection.3.3.2}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.4}倒谱分析}{23}{section.3.4}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.5}常见的声学特征}{24}{section.3.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.1}FBank}{24}{subsection.3.5.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.2}MFCC}{25}{subsection.3.5.2}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.6}具体操作}{25}{section.3.6}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.1}利用librosa读取音频}{25}{subsection.3.6.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.2}提取梅尔频谱}{26}{subsection.3.6.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.3}提取MFCC}{27}{subsection.3.6.3}%
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {4}音库制作和文本前端}{29}{chapter.4}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.1}音库制作}{29}{section.4.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.1}音库制作概述}{29}{subsection.4.1.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.2}发音人选型}{29}{subsection.4.1.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.3}录音文本收集}{29}{subsection.4.1.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.4}音频录制}{29}{subsection.4.1.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.5}语料整理}{29}{subsection.4.1.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.6}标注}{30}{subsection.4.1.6}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.2}文本前端}{31}{section.4.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.1}文本前端在语音合成中扮演的角色}{31}{subsection.4.2.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.2}文本前端的主要组成}{31}{subsection.4.2.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.3}文本规范化}{31}{subsection.4.2.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.4}分词}{31}{subsection.4.2.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.5}文本转音素}{32}{subsection.4.2.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.6}韵律分析}{32}{subsection.4.2.6}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.7}文本前端的工程实现}{33}{subsection.4.2.7}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.3}总结}{33}{section.4.3}%
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {5}声学模型}{34}{chapter.5}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.1}Tacotron}{34}{section.5.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.1.1}Tacotron-2简介}{34}{subsection.5.1.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.1.2}声学特征建模网络}{35}{subsection.5.1.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.1.3}损失函数}{36}{subsection.5.1.3}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.2}FastSpeech}{36}{section.5.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.1}模型结构}{36}{subsection.5.2.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.2}损失函数}{41}{subsection.5.2.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.3}小结}{41}{subsection.5.2.3}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.3}VITS}{41}{section.5.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.1}模型整体结构}{42}{subsection.5.3.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.2}变分推断}{42}{subsection.5.3.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.3}对齐估计}{43}{subsection.5.3.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.4}对抗训练}{43}{subsection.5.3.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.5}总体损失}{44}{subsection.5.3.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.6}总结}{44}{subsection.5.3.6}%
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {6}声码器}{45}{chapter.6}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.1}Griffin-Lim声码器}{45}{section.6.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.1}算法原理}{45}{subsection.6.1.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.2}代码实现}{46}{subsection.6.1.2}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.2}STRAIGHT声码器}{46}{section.6.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.1}概述}{46}{subsection.6.2.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.2}特征提取}{47}{subsection.6.2.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.3}语音合成}{47}{subsection.6.2.3}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.3}WORLD声码器}{47}{section.6.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.1}声学特征}{47}{subsection.6.3.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.2}WORLD的分析功能}{47}{subsection.6.3.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.3}DIO算法提取基频F0}{48}{subsection.6.3.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.4}CheapTrick算法提取频谱包络SP}{49}{subsection.6.3.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.5}PLATINUM提取非周期信号}{49}{subsection.6.3.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.6}WORLD的合成算法}{50}{subsection.6.3.6}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.7}使用示例}{50}{subsection.6.3.7}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.4}Neural Homomorphic Vocoder (NHV)}{51}{section.6.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.1}源滤波器合成原理}{51}{subsection.6.4.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.2}共振峰合成方法}{52}{subsection.6.4.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.3}NHV概述}{52}{subsection.6.4.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.4}整体结构}{52}{subsection.6.4.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.5}脉冲串生成器}{53}{subsection.6.4.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.6}神经网络滤波估计器(Neural Network Filter Estimator)}{53}{subsection.6.4.6}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.7}线性时变(LTV)滤波器和可训练的有限冲激响应(FIRs)}{54}{subsection.6.4.7}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.8}神经网络的训练}{55}{subsection.6.4.8}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.9}小结}{55}{subsection.6.4.9}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.5}HiFiGAN}{55}{section.6.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.1}HiFiGAN概述}{55}{subsection.6.5.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.2}HiFiGAN生成器简介}{55}{subsection.6.5.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.3}上采样结构}{56}{subsection.6.5.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.4}多感受野融合}{57}{subsection.6.5.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.5}HiFiGAN判别器简介}{59}{subsection.6.5.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.6}多尺度判别器}{59}{subsection.6.5.6}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.7}多周期判别器}{61}{subsection.6.5.7}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.8}损失函数简介}{62}{subsection.6.5.8}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.9}生成对抗损失}{62}{subsection.6.5.9}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.10}梅尔频谱损失}{63}{subsection.6.5.10}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.11}特征匹配损失}{64}{subsection.6.5.11}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.12}整体损失}{64}{subsection.6.5.12}%
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {7}语音合成知识结构}{66}{chapter.7}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.1}基本组成}{66}{section.7.1}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.2}声音转换}{67}{section.7.2}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.3}多语种语音合成}{67}{section.7.3}%
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.4}进阶}{67}{section.7.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.1}高表现力语音合成}{68}{subsection.7.4.1}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.2}个性化语音合成}{68}{subsection.7.4.2}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.3}方言迁移}{68}{subsection.7.4.3}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.4}歌唱合成}{68}{subsection.7.4.4}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.5}语音合成的稳定性}{69}{subsection.7.4.5}%
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.6}语音合成的效率优化}{69}{subsection.7.4.6}%