@article{rosler_reducing_2021,
title = {Reducing Videoconferencing Fatigue through Facial Emotion Recognition},
volume = {13},
pages = {126},
number = {5},
journaltitle = {Future Internet},
shortjournal = {Future Internet},
author = {Rößler, Jannik and Sun, Jiachen and Gloor, Peter},
date = {2021},
note = {Publisher: Multidisciplinary Digital Publishing Institute}
}
@inproceedings{jung_joint_2015,
title = {Joint fine-tuning in deep neural networks for facial expression recognition},
booktitle = {Proceedings of the {IEEE} International Conference on Computer Vision},
pages = {2983--2991},
author = {Jung, Heechul and Lee, Sihaeng and Yim, Junho and Park, Sunjeong and Kim, Junmo},
date = {2015}
}
@article{ko_brief_2018,
title = {A brief review of facial emotion recognition based on visual information},
volume = {18},
pages = {401},
number = {2},
journaltitle = {Sensors},
shortjournal = {Sensors},
author = {Ko, Byoung Chul},
date = {2018},
note = {Publisher: Multidisciplinary Digital Publishing Institute}
}
@article{mollahosseini_affectnet_2017,
title = {{AffectNet}: A database for facial expression, valence, and arousal computing in the wild},
volume = {10},
issn = {1949-3045},
pages = {18--31},
number = {1},
journaltitle = {{IEEE} Transactions on Affective Computing},
shortjournal = {{IEEE} Transactions on Affective Computing},
author = {Mollahosseini, Ali and Hasani, Behzad and Mahoor, Mohammad H},
date = {2017},
note = {Publisher: {IEEE}}
}
@inproceedings{haq_speaker-dependent_2009,
title = {Speaker-dependent audio-visual emotion recognition},
eventtitle = {{AVSP}},
pages = {53--58},
author = {Haq, Sanaul and Jackson, Philip {JB} and Edge, J},
date = {2009}
}
@article{swain_databases_2018,
title = {Databases, features and classifiers for speech emotion recognition: a review},
volume = {21},
issn = {1572-8110},
pages = {93--120},
number = {1},
journaltitle = {International Journal of Speech Technology},
shortjournal = {International Journal of Speech Technology},
author = {Swain, Monorama and Routray, Aurobinda and Kabisatpathy, Prithviraj},
date = {2018},
note = {Publisher: Springer}
}
@article{zeng_emoco_2019,
title = {{EmoCo}: Visual analysis of emotion coherence in presentation videos},
volume = {26},
issn = {1077-2626},
pages = {927--937},
number = {1},
journaltitle = {{IEEE} Transactions on Visualization and Computer Graphics},
shortjournal = {{IEEE} Transactions on Visualization and Computer Graphics},
author = {Zeng, Haipeng and Wang, Xingbo and Wu, Aoyu and Wang, Yong and Li, Quan and Endert, Alex and Qu, Huamin},
date = {2019},
note = {Publisher: {IEEE}}
}
@article{zvyagintsev_attention_2013,
title = {Attention and multisensory integration of emotions in schizophrenia},
volume = {7},
issn = {1662-5161},
pages = {674},
journaltitle = {Frontiers in Human Neuroscience},
shortjournal = {Frontiers in Human Neuroscience},
author = {Zvyagintsev, Mikhail and Parisi, Carmen and Chechko, Natalia and Nikolaev, Andrey R and Mathiak, Klaus},
date = {2013},
note = {Publisher: Frontiers}
}
@article{delle-vigne_subclinical_2014,
title = {Subclinical alexithymia modulates early audio-visual perceptive and attentional event-related potentials},
volume = {8},
issn = {1662-5161},
pages = {106},
journaltitle = {Frontiers in Human Neuroscience},
shortjournal = {Frontiers in Human Neuroscience},
author = {Delle-Vigne, Dyna and Kornreich, Charles and Verbanck, Paul and Campanella, Salvatore},
date = {2014},
note = {Publisher: Frontiers}
}
@article{livingstone_ryerson_2018,
title = {The Ryerson Audio-Visual Database of Emotional Speech and Song ({RAVDESS}): A dynamic, multimodal set of facial and vocal expressions in North American English},
volume = {13},
issn = {1932-6203},
pages = {e0196391},
number = {5},
journaltitle = {{PLoS} {ONE}},
shortjournal = {{PLoS} {ONE}},
author = {Livingstone, Steven R and Russo, Frank A},
date = {2018},
note = {Publisher: Public Library of Science San Francisco, {CA} {USA}}
}
@article{ekman_universal_1997,
title = {Universal facial expressions of emotion},
volume = {8},
pages = {27--46},
journaltitle = {Nonverbal communication: Where nature meets culture (Segerstrale U and Molnar P, eds.)},
shortjournal = {Nonverbal communication: Where nature meets culture},
author = {Ekman, Paul and Keltner, Dacher},
date = {1997}
}
@article{jain_extended_2019,
title = {Extended deep neural network for facial emotion recognition},
volume = {120},
issn = {0167-8655},
pages = {69--74},
journaltitle = {Pattern Recognition Letters},
shortjournal = {Pattern Recognition Letters},
author = {Jain, Deepak Kumar and Shamsolmoali, Pourya and Sehdev, Paramjit},
date = {2019},
note = {Publisher: Elsevier}
}
@article{burkhardt_database_2005,
title = {A Database of German Emotional Speech},
abstract = {The article describes a database of emotional speech. Ten actors (5 female and 5 male) simulated the emotions, producing 10 German utterances (5 short and 5 longer sentences) which could be used in everyday communication and are interpretable in all applied emotions.},
pages = {4},
author = {Burkhardt, Felix and Paeschke, A and Rolfes, M and Sendlmeier, W and Weiss, B},
date = {2005},
langid = {english},
}
@inproceedings{james_open_2018,
title = {An Open Source Emotional Speech Corpus for Human Robot Interaction Applications},
doi = {10.21437/Interspeech.2018-1349},
pages = {2768--2772},
author = {James, Jesin and Tian, Li and Watson, Catherine},
date = {2018-09-02},
}
@article{zhou_emotional_2021,
title = {Emotional Voice Conversion: Theory, Databases and {ESD}},
url = {http://arxiv.org/abs/2105.14762},
shorttitle = {Emotional Voice Conversion},
abstract = {In this paper, we first provide a review of the state-of-the-art emotional voice conversion research, and the existing emotional speech databases. We then motivate the development of a novel emotional speech database ({ESD}) that addresses the increasing research need. With this paper, the {ESD} database is now made available to the research community. The {ESD} database consists of 350 parallel utterances spoken by 10 native English and 10 native Chinese speakers and covers 5 emotion categories (neutral, happy, angry, sad and surprise). More than 29 hours of speech data were recorded in a controlled acoustic environment. The database is suitable for multi-speaker and cross-lingual emotional voice conversion studies. As case studies, we implement several state-of-the-art emotional voice conversion systems on the {ESD} database. This paper provides a reference study on {ESD} in conjunction with its release.},
journaltitle = {{arXiv}:2105.14762 [cs]},
author = {Zhou, Kun and Sisman, Berrak and Liu, Rui and Li, Haizhou},
urldate = {2021-07-14},
date = {2021-05-31},
eprinttype = {arxiv},
eprint = {2105.14762},
keywords = {Computer Science - Computation and Language},
}
@article{pichora-fuller_toronto_2020,
title = {Toronto emotional speech set ({TESS})},
url = {https://dataverse.scholarsportal.info/dataset.xhtml?persistentId=doi:10.5683/SP2/E8H2MF},
doi = {10.5683/SP2/E8H2MF},
abstract = {These stimuli were modeled on the Northwestern University Auditory Test No. 6 ({NU}-6; Tillman \& Carhart, 1966). A set of 200 target words were spoke...},
author = {Pichora-Fuller, M. Kathleen and Dupuis, Kate},
urldate = {2021-07-14},
date = {2020-02-13},
langid = {english},
note = {Publisher: Scholars Portal Dataverse; Type: dataset},
}
@inproceedings{de_carolis_engaged_2019,
title = {“Engaged Faces”: Measuring and Monitoring Student Engagement from Face and Gaze Behavior},
eventtitle = {{IEEE}/{WIC}/{ACM} International Conference on Web Intelligence-Companion Volume},
pages = {80--85},
author = {De Carolis, Berardina and D'Errico, Francesca and Macchiarulo, Nicola and Palestra, Giuseppe},
date = {2019}
}
@misc{onnx_runtime_developers_onnx_2021,
title = {{ONNX} Runtime},
url = {https://onnxruntime.ai/},
author = {{ONNX Runtime developers}},
date = {2021}
}
@book{damasio_descartes_2006,
location = {London},
title = {Descartes' Error: Emotion, Reason and the Human Brain},
isbn = {978-0-09-950164-0},
shorttitle = {Descartes' Error},
abstract = {In the centuries since Descartes famously proclaimed, 'I think, therefore I am,' science has often overlooked emotions as the source of a person's true being. Even modern neuroscience has tended until recently to concentrate on the cognitive aspects of brain function, disregarding emotions. This attitude began to change with the publication of Descartes' Error. Antonio Damasio challenged traditional ideas about the connection between emotions and rationality. In this wonderfully engaging book, Damasio takes the reader on a journey of scientific discovery through a series of case studies, demonstrating what many of us have long suspected: emotions are not a luxury, they are essential to rational thinking and to normal social behaviour.},
pagetotal = {352},
publisher = {Vintage},
author = {Damasio, Antonio},
date = {2006-07-06}
}
@article{tyng_influences_2017,
title = {The Influences of Emotion on Learning and Memory},
volume = {8},
issn = {1664-1078},
url = {https://www.frontiersin.org/articles/10.3389/fpsyg.2017.01454/full},
doi = {10.3389/fpsyg.2017.01454},
abstract = {Emotion has a substantial influence on the cognitive processes in humans, including perception, attention, learning, memory, reasoning, and problem solving. Emotion has a particularly strong influence on attention, especially modulating the selectivity of attention as well as motivating action and behavior. This attentional and executive control is intimately linked to learning processes, as intrinsically limited attentional capacities are better focused on relevant information. Emotion also facilitates encoding and helps retrieval of information efficiently. However, the effects of emotion on learning and memory are not always univalent, as studies have reported that emotion either enhances or impairs learning and long-term memory retention, depending on a range of factors. Recent neuroimaging findings have indicated that the amygdala and prefrontal cortex cooperate with the medial temporal lobe in an integrated manner that affords (i) the amygdala modulating memory consolidation; (ii) the prefrontal cortex mediating memory encoding and formation; and (iii) the hippocampus for successful learning and long-term memory retention. We also review the nested hierarchies of circular emotional control and cognitive regulation (bottom-up and top-down influences) within the brain to achieve optimal integration of emotional and cognitive processing. This review highlights a basic evolutionary approach to emotion to understand the effects of emotion on learning and memory and the functional roles played by various brain regions and their mutual interactions in relation to emotional processing. We also summarize the current state of knowledge on the impact of emotion on memory and map implications for educational settings. In addition to elucidating the memory-enhancing effects of emotion, neuroimaging findings extend our understanding of emotional influences on learning and memory processes; this knowledge may be useful for the design of effective educational curricula to provide a conducive learning environment for both traditional “live” learning in classrooms and “virtual” learning through online-based educational technologies.},
journaltitle = {Frontiers in Psychology},
shortjournal = {Front. Psychol.},
author = {Tyng, Chai M. and Amin, Hafeez U. and Saad, Mohamad N. M. and Malik, Aamir S.},
urldate = {2021-07-30},
date = {2017},
note = {Publisher: Frontiers},
keywords = {Amygdala, Arousal, emotional valence, Learning, Medial temporal lobe ({MTL}), Memory, Neuroimaging, Personality, prefrontal cortex ({PFC})},
}
@book{gallo_talk_2014,
title = {Talk like {TED}: the 9 public-speaking secrets of the world's top minds},
isbn = {978-1-4472-6113-1},
shorttitle = {Talk like {TED}},
publisher = {St. Martin's Press},
author = {Gallo, Carmine},
date = {2014},
}
@inproceedings{yang_wider_2016,
title = {{WIDER} {FACE}: A face detection benchmark},
shorttitle = {{WIDER} {FACE}},
pages = {5525--5533},
booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
author = {Yang, Shuo and Luo, Ping and Loy, Chen-Change and Tang, Xiaoou},
date = {2016},
}
@article{kong_panns_2020,
title = {{PANNs}: Large-scale pretrained audio neural networks for audio pattern recognition},
volume = {28},
issn = {2329-9290},
pages = {2880--2894},
journaltitle = {{IEEE}/{ACM} Transactions on Audio, Speech, and Language Processing},
shortjournal = {{IEEE}/{ACM} Transactions on Audio, Speech, and Language Processing},
author = {Kong, Qiuqiang and Cao, Yin and Iqbal, Turab and Wang, Yuxuan and Wang, Wenwu and Plumbley, Mark D},
date = {2020},
note = {Publisher: {IEEE}}
}
@article{he_deep_2015,
title = {Deep Residual Learning for Image Recognition},
volume = {abs/1512.03385},
url = {http://arxiv.org/abs/1512.03385},
journaltitle = {{CoRR}},
author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
date = {2015},
eprinttype = {arxiv},
eprint = {1512.03385}
}
@article{ioffe_batch_2015,
title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift},
volume = {abs/1502.03167},
url = {http://arxiv.org/abs/1502.03167},
journaltitle = {{CoRR}},
author = {Ioffe, Sergey and Szegedy, Christian},
date = {2015},
eprinttype = {arxiv},
eprint = {1502.03167}
}
@article{derrico_tracking_2019,
title = {Tracking a leader’s humility and its emotions from body, face and voice},
volume = {17},
doi = {10.3233/WEB-190401},
pages = {63--74},
journaltitle = {Web Intelligence and Agent Systems},
author = {D'Errico, Francesca and Poggi, Isabella},
date = {2019-02}
}
@inproceedings{chen_towards_2014,
location = {New York, {NY}, {USA}},
title = {Towards Automated Assessment of Public Speaking Skills Using Multimodal Cues},
isbn = {978-1-4503-2885-2},
url = {https://doi.org/10.1145/2663204.2663265},
doi = {10.1145/2663204.2663265},
series = {{ICMI} '14},
abstract = {Traditional assessments of public speaking skills rely on human scoring. We report an initial study on the development of an automated scoring model for public speaking performances using multimodal technologies. Task design, rubric development, and human rating were conducted according to standards in educational assessment. An initial corpus of 17 speakers with 4 speaking tasks was collected using audio, video, and 3D motion capturing devices. A scoring model based on basic features in the speech content, speech delivery, and hand, body, and head movements significantly predicts human rating, suggesting the feasibility of using multimodal technologies in the assessment of public speaking skills.},
pages = {200--203},
booktitle = {Proceedings of the 16th International Conference on Multimodal Interaction},
publisher = {Association for Computing Machinery},
author = {Chen, Lei and Feng, Gary and Joe, Jilliam and Leong, Chee Wee and Kitchen, Christopher and Lee, Chong Min},
date = {2014},
note = {event-place: Istanbul, Turkey},
keywords = {body tracking, educational applications, multimodal corpus, multimodal presentation assessment, public speaking}
}
@inproceedings{krizhevsky_imagenet_2012,
location = {Red Hook, {NY}, {USA}},
title = {{ImageNet} Classification with Deep Convolutional Neural Networks},
series = {{NIPS}'12},
abstract = {We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the {ImageNet} {LSVRC}-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5\% and 17.0\% which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully-connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient {GPU} implementation of the convolution operation. To reduce overfitting in the fully-connected layers we employed a recently-developed regularization method called "dropout" that proved to be very effective. We also entered a variant of this model in the {ILSVRC}-2012 competition and achieved a winning top-5 test error rate of 15.3\%, compared to 26.2\% achieved by the second-best entry.},
pages = {1097--1105},
booktitle = {Proceedings of the 25th International Conference on Neural Information Processing Systems - Volume 1},
publisher = {Curran Associates Inc.},
author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.},
date = {2012},
note = {event-place: Lake Tahoe, Nevada}
}
@article{howard_mobilenets_2017,
title = {{MobileNets}: Efficient Convolutional Neural Networks for Mobile Vision Applications},
volume = {abs/1704.04861},
url = {http://arxiv.org/abs/1704.04861},
journaltitle = {{CoRR}},
author = {Howard, Andrew G. and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and Adam, Hartwig},
date = {2017},
eprinttype = {arxiv},
eprint = {1704.04861}
}
@article{griffin_signal_1984,
title = {Signal estimation from modified short-time Fourier transform},
volume = {32},
pages = {236--243},
number = {2},
journaltitle = {{IEEE} Transactions on Acoustics, Speech, and Signal Processing},
author = {Griffin, Daniel and Lim, Jae},
date = {1984},
note = {Publisher: {IEEE}}
}
@inproceedings{mcfee_librosa_2015,
location = {Austin, Texas},
title = {librosa: Audio and Music Signal Analysis in Python},
volume = {14},
url = {https://conference.scipy.org/proceedings/scipy2015/brian_mcfee.html},
doi = {10.25080/Majora-7b98e3ed-003},
shorttitle = {librosa},
abstract = {This document describes version 0.4.0 of librosa: a Python package for audio and music signal processing. At a high level, librosa provides implementations of a variety of common functions used throughout the field of music information retrieval. In this document, a brief overview of the library’s functionality is provided, along with explanations of the design goals, software development practices, and notational conventions.},
eventtitle = {Python in Science Conference},
pages = {18--24},
author = {{McFee}, Brian and Raffel, Colin and Liang, Dawen and Ellis, Daniel and {McVicar}, Matt and Battenberg, Eric and Nieto, Oriol},
urldate = {2021-08-04},
date = {2015},
langid = {english},
}
@article{codd_further_1972,
title = {Further normalization of the data base relational model},
volume = {6},
pages = {33--64},
journaltitle = {Data Base Systems},
author = {Codd, Edgar F},
date = {1972},
note = {Publisher: Prentice-Hall Englewood Cliffs, {NJ}}
}