From e86dd180bc3f642357067910b3f6d90d4d062ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Ren=C3=A9=20Sch=C3=A4dler?= Date: Mon, 22 Feb 2021 13:19:49 +0100 Subject: [PATCH] Update README.md and manuscript And remove some redundant code --- README.md | 179 +++--- fade/evaluation/play_evaluate.m | 7 - fade/features/sgbfb-abel-reduced/hl2spl.m | 7 +- .../sgbfb-abel-reduced/log_mel_spectrogram.m | 174 +----- fade/features/sgbfb-abel-reduced/mvn.m | 12 +- manuscript/images/platt-diagramm.pdf | Bin 0 -> 15409 bytes manuscript/ms.tex | 577 +++++++++--------- 7 files changed, 391 insertions(+), 565 deletions(-) mode change 100644 => 120000 fade/features/sgbfb-abel-reduced/hl2spl.m mode change 100644 => 120000 fade/features/sgbfb-abel-reduced/log_mel_spectrogram.m mode change 100644 => 120000 fade/features/sgbfb-abel-reduced/mvn.m create mode 100644 manuscript/images/platt-diagramm.pdf diff --git a/README.md b/README.md index 01161f2..9698f10 100644 --- a/README.md +++ b/README.md @@ -35,89 +35,102 @@ You will need: ## Data structure of this repository Here is an overview of the most important files and folders in this repository. -Please note that there are many symbolic links which are not listed here. 
- -* ***data/*** (scripts and signals needed for stimulus generation, used by EMA and FADE) - * *matrix/* (matrix sentence test and maskers signal) - * *maskers/* (prepared masker signals) - * *source/* (source masker signals) - * prepare_signals.sh (script to resample masker signals with sox) - * *speech/* (speech signals) - * *default/* (prepared speech signals) - * *source/* (source speech signals) - * prepare_signals.sh (script to resample speech signals with sox) - * *processing/* (files for openMHA-based signal processing) - * *platt/* (PLATT reference configuration) - * generate_processings.sh (script to generate PLATT-1 to PLATT-8 variants from reference configuration) -* ***ema/*** (files for measurements with the Essential Measurement Applications, not needed for simulations with FADE) - * ema.sh (main script to run the Essential Measurement Applications) -* ***fade/*** (files for simulations of measurements with FADE) - * *evaluation/* (scripts to evaluate the simulation results) - * play_evaluate.m (script to plot Plomp curves) - * play_psyfun.m (script to plot psychometric functions) - * play_tables.m (script to generate average improvements table) - * results.txt (simulation results) - * psyfun-data.txt (psychometric functions of selected simulation results) - * *features/sgbfb-abel-full/* (feature extraction) - * feature_extraction.m (main feature extraction script, implements level uncertainty and limited frequency range) - * fade_simulate.sh (script to set up and run one FADE simulation) - * run_experiments.sh (script that runs all matrix sentence FADE simulations) - * snippets.txt (misc BASH code fragements, e.g., to read out psychometric functions from FADE experiments) -* ***loop/*** (JACK plugin that can be used to perform headphone compensation) - * 'tools/'update_configuration.m (script to generate compensation filters) -* ***manuscript/*** (manuscript source files) - * ms.tex (main LaTeX file, to be compiled with pdflatex) -* ***platt/*** 
(PLATT implementation and configuration scripts) - * *src/* (source code of PLATT implementation) - * *configuration/* (output directory of binary configuration files) - * *core/* (PLATT C routines) - * *jack/* (PLATT JACK plugin) - * *octave/* (PLATT octave wrapper functions and demo script) - * play_demo.m (extensive commented demo script) - * *tools/* (scripts to generate and update binary PLATT configuration files) - * configuration.m (PLATT "user" configuration file) - * mel_gammatone_iir.m (script to calculate filter bank coefficients used by PLATT for the signal analysis/resynthesis) - * play_mel_gammatone_demo.m (script to demonstrate properties of filter bank with figures) - * set_configuration.m (script to complete PLATT "user" configuration) - * write_configuration.m (script to write binary configuration files) - * update_configuration.m (script to read "user" configuration from configuration.m and write binary configuration to ../src/configuration) - * live.sh (script to start PLATT on mobile hearing aid prototype) -* ***make.sh*** (script to compile and set up tools in *loop/* and *platt/*) - +Please note that there are many symbolic links which are not listed. +``` +. 
+├── data/ (scripts and signals needed for stimulus generation, used by EMA and FADE) +│ ├── matrix/ (matrix sentence test and maskers signal) +│ │ ├── maskers/ (prepared masker signals) +│ │ ├── source/ (source masker signals) +│ │ ├── prepare_signals.sh (script to resample masker signals with sox) +│ │ └── speech/ (speech signals) +│ │ ├── default/ (prepared speech signals) +│ │ ├── source/ (source speech signals) +│ │ └── prepare_signals.sh (script to resample speech signals with sox) +│ │ +│ └── processing/ (files for openMHA-based signal processing) +│ ├── platt/ (PLATT reference configuration) +│ └── generate_processings.sh (script to generate PLATT-1 to PLATT-8 variants from reference configuration) +│ +├── ema/ (files for measurements with the Essential Measurement Applications, not needed for simulations with FADE) +│ └── ema.sh (main script to run the Essential Measurement Applications) +│ +├── fade/ (files for simulations of measurements with FADE) +│ ├── evaluation/ (scripts to evaluate the simulation results) +│ │ ├── play_evaluate.m (script to plot Plomp curves) +│ │ ├── play_psyfun.m (script to plot psychometric functions) +│ │ ├── play_tables.m (script to generate average improvements table) +│ │ ├── results.txt (simulation results) +│ │ └── psyfun-data.txt (psychometric functions of selected simulation results) +│ │ +│ ├── features/ (feature extraction implementation) +│ │ └── sgbfb-abel-full/ +│ │ └── feature_extraction.m (main feature extraction script, implements level uncertainty and limited frequency range) +│ │ +│ ├── fade_simulate.sh (script to set up and run one FADE simulation) +│ ├── run_experiments.sh (script that runs all matrix sentence FADE simulations) +│ └── snippets.txt (misc BASH code fragements, e.g., to read out psychometric functions from FADE experiments) +│ +├── loop/ (JACK plugin that can be used to perform headphone compensation) +│ └── tools/ update_configuration.m (script to generate compensation filters) +│ +├── 
manuscript/ (manuscript source files) +│ └── ms.tex (main LaTeX file, to be compiled with pdflatex) +│ +├── platt/ (PLATT implementation and configuration scripts) +│ ├── src/ (source code of PLATT implementation) +│ │ ├── configuration/ (output directory of binary configuration files) +│ │ ├── core/ (PLATT C routines) +│ │ ├── jack/ (PLATT JACK plugin) +│ │ └── octave/ (PLATT octave wrapper functions and demo script) +│ │ └── play_demo.m (extensive commented demo script) +│ │ +│ ├── tools/ (scripts to generate and update binary PLATT configuration files) +│ │ ├── configuration.m (PLATT "user" configuration file) +│ │ ├── mel_gammatone_iir.m (script to calculate filter bank coefficients used by PLATT for the signal analysis/resynthesis) +│ │ ├── play_mel_gammatone_demo.m (script to demonstrate properties of filter bank with figures) +│ │ ├── set_configuration.m (script to complete PLATT "user" configuration) +│ │ ├── write_configuration.m (script to write binary configuration files) +│ │ └── update_configuration.m (script to read "user" configuration from configuration.m and write binary configuration to ../src/configuration) +│ │ +│ └── live.sh (script to start PLATT on mobile hearing aid prototype) +│ +└── make.sh (script to compile and set up tools in loop/ and platt/) +``` ## Initial set up Run - -> ./make.sh - -in the root directory of the repository to compile and setup the tools in *loop/* and *platt/*. +``` +./make.sh +``` +in the root directory of the repository to compile and setup the tools in `loop/` and `platt/`. However, for this command to succeed, the line that starts with "error(" in loop/tools/update_configuration.m has to be deleted or commented out. If you are only interested in the simulations with FADE, just remove the line. The file controls the calibration/compensation of the playback (not used by FADE, only by EMA). 
Run - -> ./generate_processings.sh - -in the folder *data/processing/* to generate the binary configuration files for PLATT-1 to PLATT-8. +``` +./generate_processings.sh +``` +in the folder `data/processing/` to generate the binary configuration files for PLATT-1 to PLATT-8. Further, for the simulations with FADE, the matrix sentence signals and noise masker signals need to be prepared. FADE interprets a root-mean-square (RMS) of 1 as 130dB SPL. Speech and noise files are expected to scaled to 65dB SPL, that is -65dB FS. You need to scale the source material accordingly (and don't forget to save the result with 32 bits per sample). -The matrix sentence speech files need to be placed in *data/matrix/speech/source/*. -Then run the script `prepare_signals.sh` in *data/matrix/speech/*. +The matrix sentence speech files need to be placed in `data/matrix/speech/source/`. +Then run the script `prepare_signals.sh` in `data/matrix/speech/`. -The noise masker files needs to be placed in *data/matrix/source/*. -Then run the script `prepare_signals.sh` in *data/matrix/*. +The noise masker files need to be placed in `data/matrix/source/`. +Then run the script `prepare_signals.sh` in `data/matrix/`. The "prepare_signals.sh" scripts just resample the corresponding source signals to 48kHz with sox, because the Octave resample function had a [funny bug](https://savannah.gnu.org/bugs/?func=detailitem&item_id=59149). ## Mentionable details of FADE setup The simulations require a correct installation of FADE. -It is required that FADE was [added to the PATH environment variable}(https://github.com/m-r-s/fade/tree/2.4.0#fade). +It is required that FADE was [added to the PATH environment variable](https://github.com/m-r-s/fade/tree/2.4.0#fade). Also, the default parallel configuration of FADE will be used. Almost always, using the maximum number of available threads, as reported by `nproc`, is far from optimal. 
@@ -126,48 +139,48 @@ Probably I will write a script that automates this step some day. For now, you can look at the [corresponding FADE tutorial](https://github.com/m-r-s/fade/blob/2.4.0/tutorials/ADVANCED_PARALLELIZATION.md). For a system with a Ryzen 9 3900X CPU with 12/24 cores/threads and 64Gb RAM, the following parallel configuration was optimal in these experiments: - -> CORPUS_THREADS=8 -> PROCESSING_THREADS=24 -> FEATURES_THREADS=10 -> TRAINING_THREADS=16 -> RECOGNITION_THREADS=12 - +``` +CORPUS_THREADS=8 +PROCESSING_THREADS=24 +FEATURES_THREADS=10 +TRAINING_THREADS=16 +RECOGNITION_THREADS=12 +``` ## Simulations with FADE The main simulation script is `fade/run_experiments.sh`. You should now be able to run it, which would start the 1760 FADE simulations. To test if everything works, it is generally a good idea to start with one representative simulation which can be configured at the beginning of the `run_experiments.sh` script, for example: - -> MEASUREMENTS=( matrix,platt4-default,icra5,70,b ) -> INDIVIDUALS=( P-4000-14 ) - +``` +MEASUREMENTS=( matrix,platt4-default,icra5,70,b ) +INDIVIDUALS=( P-4000-14 ) +``` The simulations use the default ramdisk in Ubuntu Linux, for which approximately 24Gb space in /dev/shm are needed. This behavior can be changed with the variable WORKDIR in `fade/fade_simulate.sh`. Running - -> ./run_experiments.sh - -in the folder *fade/* will output sparse information on simulation progress. +``` +./run_experiments.sh +``` +in the folder `fade/` will output sparse information on simulation progress. The simulation log is saved to the file `simulation.log`. You can use the command `tail -f simulation.log` to follow the simulation log live in a separate terminal window. Finally, the simulation results will be collected to the results file `results.txt`. ## Evaluation of simulation results -The results file `results.txt` can be copied to the *fade/evaluation* folder. 
+The results file `results.txt` can be copied to the `fade/evaluation` folder. The `results.txt` from my simulations is provided. Now, the Octave scripts `play_evaluate.m` and `play_tables.m` can be used to generate the Plomp-curve figures and the table presented in the manuscript. ## Manuscript -The manuscript sources can be found in the folder *manuscript/* - -Run +The manuscript sources can be found in the folder `manuscript/` +Running +``` pdflatex ms.tex - -to generate a PDF document. +``` +in that folder will generate a PDF document. You will probably need to install the texlive-full package. diff --git a/fade/evaluation/play_evaluate.m b/fade/evaluation/play_evaluate.m index 397950d..42b0fe7 100644 --- a/fade/evaluation/play_evaluate.m +++ b/fade/evaluation/play_evaluate.m @@ -25,35 +25,30 @@ maskers = {'olnoise' 'icra5'}; maskers_strings = {'OLNOISE' 'ICRA5-250'}; -%% frequency range %profiles = {'P-8000-1' 'none'; % 'P-4000-1' 'none'; % 'P-2000-1' 'none'; % 'P-1000-1' 'none'; % }; -%% level uncertainty %profiles = {'P-8000-1' 'none'; % 'P-8000-7' 'none'; % 'P-8000-14' 'none'; % 'P-8000-21' 'none'; % }; -%% level uncertainty with frequency range limited to 1000 Hz %profiles = {'P-1000-1' 'none'; % 'P-1000-7' 'none'; % 'P-1000-14' 'none'; % 'P-1000-21' 'none'; % }; -%% limited frequency range with increased level uncertainty %profiles = {'P-8000-1' 'none'; % 'P-4000-7' 'none'; % 'P-2000-14' 'none'; % 'P-1000-21' 'none'; % }; -%% limited frequency range with increased level uncertainty %profiles = {'P-4000-7' 'none'; % 'P-4000-7' 'platt2'; % 'P-4000-7' 'platt4'; @@ -61,7 +56,6 @@ % 'P-8000-1' 'none'; % } -%% limited frequency range with increased level uncertainty %profiles = {'P-2000-14' 'none'; % 'P-2000-14' 'platt2'; % 'P-2000-14' 'platt4'; @@ -69,7 +63,6 @@ % 'P-8000-1' 'none'; % } -% limited frequency range with increased level uncertainty profiles = {'P-1000-21' 'none'; 'P-1000-21' 'platt2'; 'P-1000-21' 'platt4'; diff --git 
a/fade/features/sgbfb-abel-reduced/hl2spl.m b/fade/features/sgbfb-abel-reduced/hl2spl.m deleted file mode 100644 index 8a0eac4..0000000 --- a/fade/features/sgbfb-abel-reduced/hl2spl.m +++ /dev/null @@ -1,6 +0,0 @@ -function y = hl2spl(f, x) -% Hearing thresholds digitized from ISO226 Loudness Curves -f_ht = [20.7332 21.5443 21.9617 22.8209 23.7137 24.6415 25.1189 25.6055 26.1016 27.1227 27.6482 28.1838 28.7298 29.8538 30.4322 31.6228 32.2354 32.8599 33.4965 34.807 35.4813 36.8695 37.5837 38.3119 39.8107 40.582 41.3682 42.9866 43.8194 44.6684 45.5337 47.3151 48.2318 49.1662 51.0897 52.0795 53.0884 54.117 56.2341 57.3236 58.4341 59.5662 63.0957 65.5642 68.1292 69.4491 70.7946 74.9894 76.4422 77.9232 80.9717 85.7696 90.8518 94.4061 96.2351 103.912 107.978 114.376 121.153 128.332 133.352 138.569 141.254 152.522 155.477 161.56 171.133 177.828 188.365 203.392 207.332 215.443 228.209 237.137 246.415 256.055 266.073 276.482 287.298 298.538 322.354 361.687 383.119 390.541 421.697 455.337 482.318 520.795 541.17 584.341 607.202 643.181 655.642 668.344 721.661 779.232 794.328 809.717 874.312 944.061 962.351 1079.78 1122.02 1143.76 1165.91 1258.93 1359.36 1385.69 1412.54 1467.8 1525.22 1584.89 1615.6 1646.9 1744.48 1847.85 1920.14 2033.92 2073.32 2154.43 2282.09 2371.37 2417.32 2610.16 2818.38 2872.98 3043.22 3162.28 3414.55 3548.13 3686.95 3981.07 4058.2 4298.66 4553.37 4731.51 4916.62 5011.87 5207.95 5411.7 5623.41 5843.41 6072.02 6189.66 6309.57 6556.42 6812.92 7079.46 7356.42 7498.94 7943.28 8413.95 9261.19 9809.95 10592.5 11437.6 11659.1 12115.3 12589.3 13081.8 13856.9 14125.4 14678 16000 20000]; -spl_ht = [72.6727 71.4715 70.5706 69.3694 68.4685 67.2673 66.0661 65.7658 65.1652 63.964 63.0631 62.7628 61.8619 60.6607 59.4595 58.5586 57.6577 57.3574 56.1562 55.2553 54.0541 52.8529 52.2523 51.952 50.1502 49.5495 48.9489 48.048 46.8468 46.8468 45.9459 44.7447 44.1441 43.8438 42.6426 41.4414 41.4414 40.5405 39.3393 38.7387 38.4384 37.2372 36.036 34.8348 33.6336 33.3333 32.7327 
31.5315 30.9309 30.6306 29.4294 28.5285 27.027 26.1261 26.1261 24.3243 23.4234 22.5225 21.3213 20.4204 19.5195 18.9189 18.6186 17.1171 16.8168 16.2162 15.3153 14.7147 13.8138 12.6126 12.3123 12.012 11.1111 10.8108 10.2102 9.90991 9.30931 9.00901 8.40841 8.10811 7.20721 6.30631 5.70571 5.70571 5.10511 4.5045 4.2042 3.9039 3.6036 3.3033 3.003 3.003 2.7027 2.7027 2.4024 2.4024 2.4024 2.1021 2.1021 2.1021 2.1021 2.1021 2.4024 2.4024 2.4024 3.003 3.3033 3.3033 3.3033 3.003 3.003 2.4024 2.1021 2.1021 1.2012 0 -0.900901 -1.8018 -2.4024 -3.003 -3.9039 -4.5045 -4.8048 -5.40541 -6.00601 -6.00601 -6.00601 -6.30631 -6.30631 -6.00601 -6.00601 -5.10511 -4.8048 -3.9039 -2.7027 -1.8018 -0.600601 -0.3003 0.3003 1.2012 2.4024 3.3033 4.2042 4.8048 5.40541 6.30631 7.50751 8.40841 9.60961 9.90991 11.4114 12.3123 13.8138 14.1141 14.7147 14.7147 15.015 14.7147 14.7147 14.4144 13.8138 13.8138 13.5135 12 100]; -y = x+interp1(f_ht,spl_ht,f,'linear'); -end \ No newline at end of file diff --git a/fade/features/sgbfb-abel-reduced/hl2spl.m b/fade/features/sgbfb-abel-reduced/hl2spl.m new file mode 120000 index 0000000..87e2d83 --- /dev/null +++ b/fade/features/sgbfb-abel-reduced/hl2spl.m @@ -0,0 +1 @@ +../sgbfb-abel-full/hl2spl.m \ No newline at end of file diff --git a/fade/features/sgbfb-abel-reduced/log_mel_spectrogram.m b/fade/features/sgbfb-abel-reduced/log_mel_spectrogram.m deleted file mode 100644 index 8a0d4e5..0000000 --- a/fade/features/sgbfb-abel-reduced/log_mel_spectrogram.m +++ /dev/null @@ -1,173 +0,0 @@ -function [log_mel_spec freq_centers] = log_mel_spectrogram(signal, fs, win_shift, win_length, freq_range, num_bands, band_factor) -% usage: log_mel_spec = log_mel_spectrogram(signal, fs) -% signal waveform signal -% fs sampling rate in Hz -% -% usage [log_mel_spec freq_centers] = log_mel_spectrogram(signal, fs, win_shift, win_length, freq_range, num_bands, band_factor) -% win_shift window shift in ms -% win_length window length in ms -% freq_range [lower upper] frequency -% 
num_bands number of Mel-bands in freq_range -% band_factor spectral super-sampling factor -% -% - Log Mel-spectrogram v1.1 - -% -% This script extracts spectro-temporal representations called -% "logarithmically scaled Mel-spectrograms" from audio signals. -% It roughly resembles basic auditory principles such as a limited spectral -% resolution and a compressive intensity perception. -% A detailed explanation is given in [1]. -% -% Copyright (C) 2015-2018 Marc René Schädler -% E-mail marc.r.schaedler@uni-oldenburg.de -% Institute Carl-von-Ossietzky University Oldenburg, Germany -% -%----------------------------------------------------------------------------- -% -% Release Notes: -% v1.0 - Inital release -% v1.1 - Add option for spectral super-sampling and increase upper frequency limit - -%% Default settings and checks - -% Make signal a row vector -assert(sum(size(signal) > 1) == 1, 'signal must be a vector'); -signal = signal(:).'; - -% Set the default window shift to 10 ms -if nargin < 3 || isempty(win_shift) - win_shift = 10; % ms -end - -% Set the default window length to 25 ms -if nargin < 4 || isempty(win_length) - win_length = 25; % ms -end - -% Set the default frequency range from 64Hz to fs/2 (max. 
12kHz) -if nargin < 5 || isempty(freq_range) - freq_range = [64 min(floor(fs./2), 12000)]; -end - -% Cover the maximum frequency range with equally Mel-spaced filters -% this results in 23 Mel-bands for freq range = [64 4000] -if nargin < 6 || isempty(num_bands) - channel_dist = (hz2mel(4000) - hz2mel(64))./(23+1); % Distance between center frequencies in Mel - num_bands = floor((hz2mel(freq_range(2)) - hz2mel(freq_range(1)))./channel_dist)-1; - freq_range(2) = mel2hz(hz2mel(freq_range(1))+channel_dist.*(num_bands+1)); -end - -% Set the default band_factor to 1 -if nargin < 7 || isempty(band_factor) - band_factor = 1; -end - - -%% Calculation of Mel-spectrogram - -% Convert lengths and shifts to samples -M = round(win_shift./1000.*fs); -N = round(win_length./1000.*fs); -num_coeff = 2.^ceil(log2(N)); - -% Signal framing -num_frames = 1 + floor ((length(signal) - N) ./ M); -frames = zeros(N, num_frames); -for i=1:num_frames - frames(:,i) = signal(1+(i-1)*M:N+(i-1)*M); -end - -% Windowing -window_function = hamming(N); - -% Normalize root-mean-square to preserve energy -window_function = window_function ./ sqrt(mean(window_function.^2)); - -% Apply window function -signal_frame = bsxfun(@times, frames, window_function); - -% Calculate spectrum of each frame -spec = 1./num_coeff .* abs(fft(signal_frame, num_coeff, 1)); - -% Mel-transformation -freq_centers = mel2hz(linspace(hz2mel(freq_range(1)), hz2mel(freq_range(2)), (num_bands+1).*band_factor+1)); -mel_spec = triafbmat(fs, num_coeff, freq_centers, [1 1].*band_factor) * spec; - -% Return only real center frequencies -freq_centers = freq_centers(1+band_factor:end-band_factor); - - -%% Logarithmic compression - -% Relative to 0 dB SPL with lower limit at -20 dB SPL and upper limit at 130 dB SPL -log_mel_spec = max(-20,min(0,20.*log10(max(mel_spec, 0))) + 130); -end - - -function [transmat, freq_centers_idx] = triafbmat(fs, num_coeff, freq_centers, width) -% Generate a matrix that joins spectral bins via triangular 
filters - -% Caching whitelist (feel free to add Matlab versions) -caching = is_octave(); - -if caching - % Build a config id string - config = strrep(sprintf('c%.0f', [fs num_coeff freq_centers.*100 width]),'-','_'); - % Load cache - persistent cache; -end - -% Only generate Matrices which are not cached -if ~caching || isempty(cache) || ~isfield(cache, config) - width_left = width(1); - width_right = width(2); - freq_centers_idx = round(freq_centers./fs .* num_coeff); - num_bands = length(freq_centers)-(width_left+width_right); - transmat = zeros(num_bands, num_coeff); - for i=1:num_bands - left = freq_centers_idx(i); - center = freq_centers_idx(i+width_left); - right = freq_centers_idx(i+width_left+width_right); - start_raise = 0; - stop_raise = 1; - start_fall = 1; - stop_fall = 0; - if (left >= 1) - transmat(i,left:center) = linspace(start_raise, stop_raise, center-left+1); - end - if (right <= num_coeff) - transmat(i,center:right) = linspace(start_fall, stop_fall, right-center+1); - end - end - if caching - % Save to cache - cache.(config).transmat = transmat; - cache.(config).freq_centers_idx = freq_centers_idx; - end -else - % Load from cache - transmat = cache.(config).transmat; - freq_centers_idx = cache.(config).freq_centers_idx; -end -end - - -function f = mel2hz (m) -% Convert frequency from Mel to Hz -f = 700.*((10.^(m./2595))-1); -end - - -function m = hz2mel (f) -% Convert frequency from Hz to Mel -m = 2595.*log10(1+f./700); -end - - -function r = is_octave () - persistent x; - if (isempty (x)) - x = exist ('OCTAVE_VERSION', 'builtin'); - end - r = x; -end diff --git a/fade/features/sgbfb-abel-reduced/log_mel_spectrogram.m b/fade/features/sgbfb-abel-reduced/log_mel_spectrogram.m new file mode 120000 index 0000000..76abcf3 --- /dev/null +++ b/fade/features/sgbfb-abel-reduced/log_mel_spectrogram.m @@ -0,0 +1 @@ +../sgbfb-abel-full/log_mel_spectrogram.m \ No newline at end of file diff --git a/fade/features/sgbfb-abel-reduced/mvn.m 
b/fade/features/sgbfb-abel-reduced/mvn.m deleted file mode 100644 index 22c67e7..0000000 --- a/fade/features/sgbfb-abel-reduced/mvn.m +++ /dev/null @@ -1,11 +0,0 @@ -function out = mvn(in) -% usage out = mean_variance_norm(in) -% -% Perform mean and variance normalization of each row of 'in' -% -% Copyright (C) 2015 Marc René Schädler -% - -out = bsxfun(@minus, in, mean(in,2)); -out = bsxfun(@times, out, 1./sqrt(mean(out.^2,2))); -end diff --git a/fade/features/sgbfb-abel-reduced/mvn.m b/fade/features/sgbfb-abel-reduced/mvn.m new file mode 120000 index 0000000..7989571 --- /dev/null +++ b/fade/features/sgbfb-abel-reduced/mvn.m @@ -0,0 +1 @@ +../sgbfb-abel-full/mvn.m \ No newline at end of file diff --git a/manuscript/images/platt-diagramm.pdf b/manuscript/images/platt-diagramm.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3772108b04d78fbb597807cc21de49f6a84ce2df GIT binary patch literal 15409 zcmd73WmsHU(=Hrbf&>i`pb75XxVyVGPH=ZzJM{u7lQ00cM~Tch&x16btE z>@6Uc01l`~6%_ygut-|jLd=|@zqUpYGjTH$2U9atK|xd(h_jiI9jZrqzK)E;f+T+1 zndUJql40==1cCwb&j4Y69#cs!8n|=msE;C+#;lB=oT0+&dGg^)-e&ac-!TG!Wml2&`tM3P$;BA9{2Z&8eKgFPt{64wWf z8t#E7C{^8Qn4ObK&7DS-^-ihbMBYl4hyyy z`}+|BuB)tk-d9Y=uTxQVXrv?MJsTQXt03~KR|?t2^<`l>Uc0%^IMy`Ca*sZ*A&C~% z8jd(K`_M`Evmy)Be=EqiKAZRPrT7M4#F#cdk&)14MgJ?(mk;tXiAe^}xMhfQyNchz z_nOBJi5BLXmw3(5E5*a)DHZ13U5UBuzU2$hCY2oa30)!xIe*UOFPG4&?X3N!E8?jv zS0kE*MsRStntaW6e?T>L-@aots~Wwnwp56Y_Bxx<6-0E5-W=5;m3Z{U7ap?$Eappb zIvgsua#||3;v3Zo$j}7%4ogdR-mxb;biXM@mfrl$(u}-Bqj`7GW}2QP;40o~L5LjNmm!m7I8-zhe&&&WI$iq716ajp;&{J;eHABzUze3EA zu)rc(C0^?ZqpidQQ1*zIjNoy+Mos_=K;*1SN3`>vg*Ryk3-^b0QPc@ngl=a(RXwLp zI*X+vCx&F&Y!RZ^>4Xlh=JFy)ttZnv25&Y;2);guw&0Sw80 z&56e?^>a?-xnozx9mWnUZfqAu6Tp-jyz?#h%+%#(`_f?p&_ItwBMq&3~Z%w=LFtwM1*XK4WP{QBW{Luvlkjm>5(E~X1`cfrX|&S>1ulT#Pn@x$JzRU z)`Ufe?~9$oL>B|v&2EnH(r5{Na#L?q_bNAn=!Fki0YtpAE&%-3Y9tyZIs} zilwup0e)vU0$=*03-TBj&Rox8=lYfBrI*kl83yL3q#wQHq=Mj=n%UIblC!gA&(UkR zhJv%hUCmW4%i$k}KWQ3#q7vE`fU`5H4kGd}8>&VyL3AdT4=8lNUoCA8QIj!Xj=Prt 
z6GhT|NR5-5u&SO8j?{`yIPAc?BRf0gQ_T{dq2uZIv9!i6rfU0GFzoF*$#}rjWhb9H z(VH_eGLs)C1{n=N43ku@XSJc&pQDgGrwU?Z# z8d_VN-%<|1#xeO&ha#$l^-0^t0(Ut?hDldc^tq_MiM?ds*~nZ%6XhivSlr&dBzBG* zEQU?>qZ)FYF&Zh;{dVka;&uV2T8WuV$H?YbN^@nFzHmhbiF72s1PC2w#-Qn2Z~WPi zD;IbMk~NOfWhslRUP9?uoLhdfYgu?`NN!ERVn|T%1=g!3EEQq$8MzF7pAH z6zgY|son-ci9@cG!)dacM*~_*BJNJ`)LX5#IR2)H$hQ&w$(u@G5p9*xyPEhaU%3dj z^&q&~In+!up%SLUyBWdeW6E-yyCy_R4jiUFGz{YY!uw06>>z(H zNfLv8W`d+yUHGu{I7wxB(#lt6*ua-(CQiB0V$aeOAV#9kUv)RJo54V$T6!rH%ruzP zz~dTJj4q-E?{xP!8OH)E`gHExbOd~6+XeZL$n4jjV!$H#bbz)$) z#F`5y<}tplR3k&)up*@>oN5zPJ+UG}^O33mO@OETWnc^#i3?mufaO$@i_Rv~A$JSRRAK0~%Ea(a^(_#YdVF?If?sgIFxy=)yd;$t;kV8)7 zmUlD+QiP9?t2o4R4&vG?X;^${bBpO%-Z13QvE=1wY31WSk0Oa~t`IjV6>+z$UQ2qZ z?d~&a;=CRwy?omDG~=0BUU3*zF0*(Nf)aRkqGmEd3}aRL4x(d=wrUXk0GE_Z)re|l zZ~8~n3_W{{w;zM-$6sie&C1Qe^Vs0A$o5w_{wn~N@PJ6EL7>4mfS>+k#&6$8qKmg~TA^qb3k0KC&>rWB1AN2jl{H%Y? z4`5OEbTk97C>vRr0jL0=U-~E;Ih#R;2mRTXMZwI}%1G3~1E32P0RbR(HXde9b|4QA z2moT`VCLfF;Njr`=>O_02GtDO+Xe85Ier5SDQ5>)$G>U&d&*zZKh0HxD&lNp@8bA4 zwTb7SVi^F$+12dNk{Gm=xS5-kiJ7XD=%ertXQ-OFIJi2Sn7IHR9jp2`L%E^lf8ZXT zzts9~xaZMxzX1{mz{A1D`X4}O)6>%j_iLL2_^zqRqRC=6bHLPqIt7i~HVg#@TUe4) zJDwN-_!5YSjOBz9O8bmkq=8aWSscJvC5{#f1A2)>L!e9vOEj_$bRY80`Z@ixWjIaIRTHeKc{g=*q=_erLQbzO>F;47$o~59tI@mxfRaAt z8K>)(Nwin*ozv{wE=G#C?{XV21=v3wnYbW_b#l&nF9(Ld^KHD9Cz-m$!ftf@@o_iE z?ed@yCG05Z^_{s%$m@{P7j;i7Gx&ezNIgGH#S32U779DKz{DR;SKL8(k@hV7Kmk+1 zfLqf`F*D>KaOTI*YtyNPfIdtu1#T(iECm8O>;zS;Pm2pkde(j|6W>z__5=h|Ic;Gp2+TXdi_ry{D_ zIKvjB2_sZ}TZnmJeI6GTE;djr>;tB9dxrqQYu)B4!5y~Krz#ISvLtX7sMzM$Z*DGw ztWh?l2#S*xq)A2Il3n75Vs=Tbv8=%fRSihX{7h1HD2A_Hr@07-FM?<4*aNUgeRIX= z)J=5ct{ZyJt|2ue4KF@dhR+1Uc!;(5er0?+m6i2(x>{DlQiYK(seNheTta; z>7+@dS()z-6_n(ah+?rrpLQW(!!8LM1L(Mzxdge$r_<7s)8f-uxy;P;JbgSx8|(R< z%*Lwez;uIjC;C}#uO}`cd7`cO7wjYfa^f z^3ul-Y7f;ZX7~vM$$qPbdn9&B6SxXOb!Xq^%g4a4Zzl5cYI3b&JKxee(UwxOa*Qar zX;Wl~N2&4GopQ6lo9umn1Q95POAK)^4lT?`%biP;q0)b-t*z~KOnF1ZzH@pZg04Uq z%m8bKo0yrN#?N&7F33w38ptyLPboiL}hy^h@fZ zYc&7;`C(HNd2KlUQP4Z8=(}@?d9s-w&+ACb>=`WP-31?}!tFLyzuf;cXB2Uzz`Rr0 
z6*@hjj6g#c=V|zg!h#bxrP~J{G?sv?-(l{Nu3zFUf_Y#S-Lu=vCPY8|TuS;5mnbY# zypu%my_Y=UhYB|I8*U~nhMplI=Z)K;Q|C7Qmg<7C661wL5K+U=f%|s`lS?Q%-bH0o|59Z4dq;}4FSGP0xQJWU{To()%*!nYbLNx9Z4?+9)jmUHQondqo6NTa-82wONno8-k8v6)fVTH%q!sKSexVdF4Lu{ z>P;&ce_Ie{JyEtv*_1i-OgZHxe$7@jgjZ7*6?p;-?-A{0O>z;pmDWNh?`K7D)0TP| z|75@>{~$+#&Bawte>m(k=;VV;C>l~ho=FaBNs2X>ss5oFE(2mR|C+f*uwUaTT}=Zb z-TgkjyuQ1n95$3&M!lir5xB^9vD4f3 z`tou|0!bNflUv?ruHt0n{-A&{detuGyW|T6uRvSV2~kN|=^885U6|$}IR2#P$Biv1 z{SY0kf>l0zw7$c34%a=H?reUvSW?G+BCkZm&8Y0!S1M*tDWc+N|Xhc&(D zUluAzj0zA%-!4<$BNM~{o2I9d_V%1HKB4X7JVnx%Tpz82qrJ^dNe{6-1NOBOSxriR z+$B`DaM9U+DNvg6B{@>1YqLpUjQWFEPfF^M^6R)_FZkATF@%%;wbC^h%|m6MZvQCO7BA7i?7y~Dzn*$Nb#0OA!(Gi zM=?EjXUzTG5MTJYzV8^!V({UM22JA!&+07vTdWM zFI}`7!RACovhLC`O=p?TN>qkv=m>SFT6Ey!!cC4o{F$PO&vLi#Ps>j$PubWzhgGX_ z1?)&cwQ_7_6Cq1Z4N=$acJsremiBzwG|jq4g$8x{4)Z-ec3Cgmvnbltu01!no*knF z(T9OgXmC}{YAL@}3S$=)WD)D`52UGv1j50crV;U?p}M{yC!8Hn%0h zB>A@??`$4U0;Y-x37LhgNdqhVxm8yN_oLap+5`R#)?3Njw+Q zANOn8+N&b#ahDXY!*2R;@*a2GgY+HPq*hCp3_jtN4$HjozOZXH=SYEPSQ2&hp|yLN zlvXO=;5bE#x6kEUbq8x>XW0~;f`r+=@62&<-KOBF&xmf0{y31(HkMAOoH)7v2+c^&C(iKD0#Bq3invNl`@nLbsRqByH%*R}900rUY z*(MwUuoq6!;|Jt>_M);r|;G@F_SCMR>gkhmtp_8ArEln7WlX!$?$H=@M zP#ymvhc5I`AHusu6xw7!iT5lL<7vc3@W*elFtNsh4R6^3zYt8Ky(lKqNJzz005iUu ztk8;+&3}9IC>fDnhlla;=p{}37IkdKHOUA*AdK#q-vz#ti;69w0x2=$xWUM zZC9?>+Fszf{FWk(;!1kybX_nL5&wco?rgRWS3_nt@q{Ke+F(aVE;hGQSMXWUjGzYgek`__4A&b%)C!6Kj!X6F5vmLs zl3kmSr?)yMG{zkViFh~)lm;*Lr9#>p?B>IBGk@r{yB&DtfOf6>tEh+ywH9S#rc4N7Nre{kCKUL0AX?9@Br-BG+lL+Jt>3exfPeOD@>K9mF; zN-{9>5pI%VmDRr*+UbLQb;U6shf8IXjMm0Kvy}&LxSq%BiR$C}Ivr+>v_HudJ-ZNoJwlhiSx{ZMY zjNS!lqpz_KCOj+bbi^)Pte6eEcANn#)Dor^VAR@ZWLUPmh~lpCS~RLJ)*uZ1O2@;| z+W(%3{A|*m)qu&gjby6SKoWju0q0%}r<>-1`DJI#qIrsAxGjQr1Ci16fuMKd>zW&u zJ?@|C`|7SyI`#(U1{Utc@L?kj3NDsI$a_Xjd1uIzWsP>DQI-~{GTlY%2`&7SV~cy5 zakZwp?^;7XgpAvq(`d7Y=gaN-_|GZrTM!!#yP5KhN6t(8BWgW?RrN>O@W-w|tBq22 zsE>Awa6^pXXU&YVhy!T$N$_R8b@NTSV0Mk=>`grS=DtNFQ4>0-apb;m#0ZpSd`j{( z18;+FU)CeT9st4&aJ0Pb%JXIC3p&9`7N75JBW(0DohBfd6bD6|1kxGZXiwgLrg{DJU 
zV=`g@oSH9Pu%S}@8d}a;(k;0IFzF)I-S1@^pKoqH)!hk|P;h8pbb4b{z{3@!cu!Im zIVpAxKsS|-r9SKJGqwWxF&6O4#)RThNyvftf2f=$w*b6+2R>Bx%Va6Yu=F9w3-OM=5@KrU`bf?c3WL@QEMwW#B!Gk+vnim zh#s%y!L)mE=dn3rROrxqb(10VJ(ZZI4gFxd59Fs4c$;Q{-X@%^xA_uWSG%Px_K}U$ zSODbbJ|#Se+nbQG+M&MT9)et2XFnh8H($4yt#NV|NxWuZz1g@H!5z7YRzeZ?BR>Uf z+(wJD2`iHyzGHj?L3WuX;ErM^FxAUWvY!q$t)|D0W2{Iat{bFx{phM~ORr@HM+;nY zlTDqpOVYn-6iH%ZHV*s7=_13X&l7S#X;=VjZZC774yvZf983GLoxE|q%6;f?*pF2k z9Xj{ID{CfQ%!MqcaV+0GYS+IT#Kq{oF zK-=Tu@aIYyEgmPjCqY-#%=Z_s1fp78F9!(OLtqmuhayrq3LA{!{XZ|bL>m| zCK;knz7i{wyf?y5-1MYM3RlAR~Cy9F_G>z#*+i$;c9&MCSfeqaRn(@`7UvHX4Sv z-Iq$Wb1|>v1N*c#lb%s)4d)Nm2BQP!GbUrl^PZ8SUWhMFRdzX4hq-H|P4griNG2+& ztMEr4HlI^%m@0;9Poq8%4{Qvsrx_~KH>glGw}V{Oq*=%#K7GE95Hn?nqm4NaFBI$9 z&N*--54URW&`?h^5&AOk`*NVt;XPi2WXv~%;deLd3t8y`4DeBMVxoR12R)~7m3oDo z{r#*=QXJnd=%1>E8gJq^t2KApy-7*E)U{u`;^0CdvEytIEnYQ~$Gxy2EWjEUgHb9k z?1LL0a8cliN+ov7@@%p?q?u)@Wjo5VTKmv6H+LsxdbzbQ%Qjt8@I}g|yS4H8ZqpO* zE1KlOK1x>qMs_MCTH(ewx4{dS5J@*L+MgFHMHVPWVJgNG-i!fR4t#Q&VF(}HrA(1r zki=}ojn|I}qnjB47^<>!yW+eV%z1u!!<9JSbL-Q%7Sq-jlaeQv?C&uH)kjfko-DH* z_>cu3Rr|eu81}=RZji1YDTDC60%uI4v?z|MT`iruudh4$$QLC_$0{R@?jWh;)U9Wg zuQpoMJ`Az&KrD*d9mSP{u7HWV-%9Do&N1TIX|_<_$fk?GM9~TtUk{pp3Z*nl z%=HG)2QEdK1wHA-26lP?ku!p3jN`pzRj_&-cHUQoa2YxVBb!ASaCSnb z#2&^t-Uk0HJT|dJ-T2uzO^Ry9Q1$J_$AtHH8&7u86HL*EaMv^shacV<*e3`|WloP2 z&KmR*5-6(5#6Zx^A~B52-#E~cSPAZKPTHjSL5%DSZ*h-`Z@ZRS@5^^5@f?_X%W7Fp z5$ES!3JrfG8uXjkd~qmFxNRGLnC&0-z9dTH`>H$7Y3#B6kpeZ?WVwq@i$ixN@{S@n zWGN+l%30_>lDX(&DE;8ItY2T!S-c=ap*xanh4-7f5R3EHX`UUL*bx{}3zmqVjr-SU zJ!S6Pr)AG&kVIWqP?r)y#9xTUTwSjxorfgCDjd*vxqYR2EW*{|2`i!jC{(Ic%Z7JY8%yg@43ZyPW*B*F2IM*Q;GiNS%sk1 zD+%iL8Ck|n%aOpr%J;(7mFnyDKL;CUlk%QTU%!1VKDH6KAxdM{S;@Y-7o+u_CHYN0 zo~B3qGr^MTTZY8_hD+VZbmlK_)|q}jdt#?`oPi&nPr8a_X50gc644sM%GJbLvj$M2 zg%fW(8AvL3A9XnSa1W8)KPAVSAq%)bu4($#w)HS8Hg+aPx+`N&v9=}|V9S}$_T-r2_uBkz`P=Kcl-g)x9F|8kk7+}-K{swH2rF`tfh5; zgYJqDyiXjQ0cKDsY+Ii{LlVs?LxLkA(yL5l1K*hLcH01!u@x7XER=?Ln1^xnER7Bc zn5>4C<)n@vuGM9DpNLMOSE$>=i$lXKK`!;3v+Z5S)v)*2aAYV~)&7_WK7bW@s&xB^ 
z?Lq|h?#`ct$C>^(4CuZu@t4p;5Cg*4S}0bM{p%50s8)*nDPZ>yTIg0PU|h|7U(o=r z5D$CcTbwM3{MHpbOi~~_O1{39I@u%`2zf%+tJ(h346FYB)9@3h9^lXSq8?vie*U<* zZWEo+U0?IQQ(k|Q6tk10YZ&dH1pghsq90)r_SM4f3XAz?MWhyGWH5JL3AP*ZOxs|0 z{nHuqz_MYfld_=WTl@?mG6Y@7%*PID7-EJ4pAY`Be72vXIn_IETrt|(_&&OPc44ac zG0SzW12bHn(w+4hCsUug%(HuVT0W2iW|vTipULo|M9`I8Q*U74f47_f6Nez*kwAkU#E&VT z$D2ifg0lehl_wx$0cZyLVF3>nGoo=E3o+c03Bnz^Q3=>|woN(=A?X>>Q%3P5_RoP; z*pliUjxe2C4uT&Mg4Bn{6KjtAsQpLVQjG5y0twCa)5Y~PaAfT|m>n1&(t>nOP^g+yl;k?g3 zQ{cA6g8W?C^@P+{+EwHdSLn!>IPWSa&0u{oePtS;?XM&pi-R3AEdJc%DR~aUT1?QD zXlS(j@8KgG=qSON_P8fAU>mSzi3{m4MeHVv0eI{D?nuvOm zy~%66^DOq$%@(<$z#@H_>qRWZCWz!>u;=i}o@2^33$Ka}q-;(@ifHDrI4!FgXSalM z`G6uTy8j|-{2upcI;DngnV|-^Kzh6JX?!z;a&p48G|k8oFaPRjfH&`4?CN`!^;BlQ z>!1jSJs618jm6FSWB&~gP{^MQ9tViB^u&Q6t;rB$0O?r72hAtq_;dLRX@iCBXkY>< zBY&BPSGXR-jrYzG!UryzR?mqG)#+>{iq~E{tvl02&CIT=yItcBIkTwU@+|{hq7D)c zs}A9QNDqoW+3vYqMP_-40fls#SI=1<);{yCqHo!n4k4bgn`L4U^?1N#2I}n$5B5&> zVcZ|eH*5^-tK!#w53~K2yyKptNm*U|5$D|9@f8B``?7|BLOXL z=3?S(@94~0Ki}U zL|jZBiEAEK?q9`6{T|CqtZW>=1Y$;x(q>i`mQZdRI?OMT8pO;_1Hk>e|6@V=&%&dH z(0y#i3V39Z|IR8u^6Y=^{oh>nW82@uxf(-$5$ftt-ukibmsHfq#q1Yj|G#<0%Gm`X zW@+U7rvv1T{wRZ3e^c`R>29d{zf^kkx`aJ6Vb{vu0>GkaWiMjyV)dus*UXw$rVvXP zC~waOdQ1iQ`-_bS2zdN(0Dnu_IH6@|D>gPx0O#L6kF}5GzxVz7IVbo3)Nye=vi|@7 zU)*fJwc>;t#L31E;9%$Z^~(w30I+fVmT|JN0a&>n+i`I5{5s=-&hjYbgl^ZzUO)gl z^o#?#PoR7DR}B!r4Yh;=nxF9a`IVReO)zi(+gm+mHv@iMz5c$Up#RCp_>=#KW;s9y z{++<^-!5l=&lF(gf?gT_etnx%w^3K0##?f4^EUT43g%n)2fTUqVmTBeR|Q0ZBKcJc zEDdX=pwbpg-QlSEvi@?xTZST6y5!}1l`0=?RDpQ*)Nkx>q)q&0Zzbd; z84ZNXxW3?;;Cvy$HGJP+`4zPivNSbGOn$FlS&jRqixlfPm@8>YjGe%XhC` zC`mMQB!7ft_WR7f-zD6tb)%Z1vX}oT%Z{DEsq2y_^MhSVN5gz7G{&TMriO1oCN+^T ztzes@I-iHYXLoYTu$TS7%cc62&j$<|d-E0XgY)jB`XO4T(H{o!8o?YTh@&$})Slkw z6mog*LKP$Hsie4b)zGKaFQy@3>K2s}`cRdDyvd&9Q zrts7vg)QOa8C~xy_x&<|jf@ zYpT^1LmaVExIb?1x$%a(v5z41=E~i2mgFz0lk4cs8SB`U00*(Rm?jnHO_}k%o9oo} zpSgzAa7@VPW~Wh)Dwe73S$`<4K}0B+6#vG4E%Bg>U8+Ci@WDAly%1l*mWXj7;J8`X zrTLOFuXO0pGe|>j3wF-Vp0%0LDwb-Y((9net+yp?uLy3r8@r_xyl-{$^oFvebrJ?w 
zt*mPaU(n=-O=;ebOR-_V9K8)XxbOi5x=rbv#}J#o5QWya4Xt~PLUIo^?u&&DGfS( z{Vnx#BsE!CHS37f8W2Z!3ut%MT~6oHaD0eO%(|Qr3N*CV=J0{~(2I^qV^Tn-p()TnIpK@u%%23+d7b!HSdR84w>baDK1B>Hx zHyKPvrW_#xQ@Avmt-b1-2Tfm)8yy!TKk+xK_Ud&< z!;8<$uno8!24YuF*VhPq(gULj$h;g^ecGCyefD(X&x*!e8dfHId+dui1X?mlzgQV& zzNPbLi*V1lGNDqeU%`w}Tk|c!&Y+bAKuJuBPP>QhKwB^ec!uoc^G@%qh2q9C17zX<%*HUvyHIr~_ zV|LQ?G9uj(t$Z?{!7_}YA8xDJntp?ka1oktGs7N1Sf(ulGuxeVZY>SdWv|YS90Ju2 zTt9pX6L{&Rgvs`V`TIg7NHqvMS$Ijz!^(YLNzo5cZ4G zVj5W?%ZbvOFq->BR|4^t%@`dwiV&_nLRECau3>K@JkiKLyagqJEYRMGc>sy3373ue zKhx4js7jFIO0M2Fz`M31U4jl11}?X<0r=OcIj8HO{-6|3feHC;G={d`WWhn@al&yl zc{6#eeq6~VQ?bp9&l{9!skVl(=s1Lz@IZ2Zz3#;Vh^sp?4-O2cVuL`X{(pl7aXY#O1} z8*2rF(W}Wj-{tJaSQiDHOQW3_9s}x$zISE^C~|y@3{XVdInsdZ{?JZ*x1ce}f@sfh z!#YOb-UR^f$-BP%8X`zeDTmT4NI5&A%R7@6Lm>L;1tqeyVb{+QHQvcMESiut>}UOs zATMD0+KMT6o`w`gglyN9p~Xz@dVC}n)6%F@1#2#vIP1$UrZ+}TU%m%2OX_F(Yau^P z{8XGHzdM5VOw4A?XOg()%L8WM{WR?TzVr=JZqm=-{}L2G1_FNs#ULJT&|gvUpV^&% zd#VcbR}SX?@v$m><^HHn09wl#o?u-F^g>5FR?@mrfyptC2{2U6LwqlWas3ma&^o^s zehSDoW$1ax(&~<~fr%hs2)o}xuGb+;WYpE`slsKLuBg;ip9#igO$)J;;l5l#9IXR< zO3}l%bG!l?b$xj9-6jGveUzq1d^$iRt><;gRlYfV@0OYa>(Q87 zUtRs0dpI)sx>s~w;1??T$h_OPxn<6-BXfEg!VdQ*gNHw6oxchfN{lCHs zaT!rLMdkksc)|WFCVq^i{~tmZzhck-2uA;&wfeu|3o#k-M>`&?e*>BS!5IJDJkXFvh^l_wLXzZ{2XV=Hv>Ax8W`tu&jqsNt<9ZX$KppRS9niyF* zI{-k;AWmi=fYuTMadhEjfu?#xr54Ucj+Ry?F3b+j7IeSyu^9AWOo)|(z4&9AIITD@ zE07h6j9FPhtnBQpJPbe(4G>88w@IO3=$~qf89@QFgT*gz{4(K>4DbIj)x+7$92E$F zVou;+2Y{1FaU2m}KDOUBLt{P(t%>Czhz(-8`PZ{^W#sdMl*`RLw&%Pjzf7uIS z<7EG*ZBQ8}bU*!PU$%ey0K~?_^G`cK?5zLp%gzP8(eaP**twzj_%9j6*$8@j!ui)j zy=qooW{>Ok_~%u1aDV`Qt^e;o;Nu@2@M{zPy2`jfjGQ6ARvN_417t;|qLNUOME(B& D94!Xr literal 0 HcmV?d00001 diff --git a/manuscript/ms.tex b/manuscript/ms.tex index b17ca8f..74a62ea 100644 --- a/manuscript/ms.tex +++ b/manuscript/ms.tex @@ -63,7 +63,7 @@ Not few users of hearing aids keep complaining about the limited benefit of their devices in noisy environments. 
Recently, in an approach to model human speech recognition by means of a re-purposed automatic speech recognition system, the loss of class D was explained by introducing a level uncertainty which reduces the individual accuracy of spectro-temporal signal levels. Based on this finding, an implementation of a patented dynamic range manipulation scheme (PLATT) is proposed, which aims to mitigate the effect of increased level uncertainty on speech recognition in noise by expanding spectral modulation patterns in the range of 2 to 4\,ERB. - An objective evaluation of the benefit in speech recognition thresholds in noise with the framework for auditory discrimination experiments suggests that more than half of the class D loss due to an increased level uncertainty might compensable. + An objective evaluation of the benefit in speech recognition thresholds in noise using an ASR-based speech recognition model suggests that more than half of the class D loss due to an increased level uncertainty might be compensable. \end{abstract} \keywords{theoretical audiology, speech perception modeling, impaired hearing, hearing loss compensation} \end{@twocolumnfalse} @@ -73,9 +73,9 @@ \section*{Introduction} \label{sec:intoduction} % -To this day, hearing aids without directional amplification or noise suppression provide their users only with limited benefit in noisy listening conditions. +To this day, hearing aids without directional amplification or directional noise suppression provide their users only with limited benefit in noisy listening conditions. % -The limited benefit of (compression) amplification in noisy listening conditions is long known and was extensively described and put into context by \cite{plomp1978}. +The limited benefit of hearing aids in noisy listening conditions is long known and was extensively described and put into context by \cite{plomp1978}. 
% There, the effect of impaired hearing on speech recognition performance was described as a sum of two fundamentally different classes of hearing loss: class A, which accounts for an attenuation of the signal, and class D, which accounts for a distortion of the signal. % @@ -96,12 +96,12 @@ \section*{Introduction} The FADE approach was already successfully used to predict the outcomes of several speech in noise recognition experiments \citep{schaedler2015,schaedler2016b} as well as the outcomes of basic psycho-acoustic experiments \citep{schaedler2016a} for listeners with normal hearing. % \cite{kollmeier2016} proposed to remove the information that is not available to an individual listener with impaired hearing in the feature extraction stage of the ASR system used in the FADE modeling approach. -% + To induce a class A loss in the model, variations in the internal spectro-temporal signal levels below the individual hearing threshold, determined by the individual audiogram, were removed. % This manipulation is illustrated in the center panel of Figure~\ref{fig:1}, where the low-energy portions (blue/green) were replaced by constant values which are equal to the individual absolute hearing threshold, while the high-energy portions above the individual absolute hearing threshold are unchanged compared to unmodified representation in the upper panel. % -\begin{figure}[h] +\begin{figure} \centerline{\includegraphics[width=0.5\textwidth]{images/kollmeier-LOGMS-A-D}} \caption{Figure reproduced from \cite{kollmeier2015}. 
Internal spectro-temporal signal representation (log Mel-spectrogram) like it is used in the FADE modeling approach of a speech in noise mixture (upper panel) and examples of manipulations to it that were introduced to induce a class A hearing loss (center panel) and a class D hearing loss (lower panel).} @@ -109,18 +109,18 @@ \section*{Introduction} \end{figure} % It seems plausible that, if all relevant signal portions are above the hearing threshold, this manipulation should have no effect on the predicted speech recognition performance. -% + To induce a class D loss in the model, random values were drawn from a normal distribution and added to the internal spectro-temportal signal levels, where the standard deviation of the normal distribution was a variable called \emph{level uncertainty}. % -This manipulation is illustrated in the lower panel in Figure~\ref{fig:1}, where all signal portions, including those above the hearing threshold are affected. +This manipulation is illustrated in the lower panel in Figure~\ref{fig:1}, where all signal portions, including those above the hearing threshold, are affected. % -Because the signal energy in that representation (which is a logarithmically scaled Mel-spectrogram) is represented in a logarithmic domain, amplification cannot be expected to change its effect on the predicted speech recognition performance. +Because the signal energy in that representation (which is a logarithmically scaled Mel-spectrogram) is represented in a logarithmic domain, linear amplification cannot be expected to change its effect on the predicted speech recognition performance. \cite{kollmeier2016} evaluated the effect of these manipulations on the predicted outcomes of the German matrix sentence test in a stationary and a fluctuating noise condition for different noise levels, and fitted the A/D-class description proposed by \cite{plomp1978} to the data. 
% The results, reproduced in Figure~\ref{fig:2}, clearly show that the two manipulations largely achieved the intended effects, that is, inducing a class A and a class D hearing loss. % -\begin{figure*}[h] +\begin{figure*} \centerline{\includegraphics[width=0.4\textwidth]{images/kollmeier-FADE-A}\includegraphics[width=0.4\textwidth]{images/kollmeier-FADE-D}} \caption{Figures reproduced from \cite{kollmeier2015}. Simulated speech recognition thresholds with FADE for a stationary and a fluctuating noise condition at different noise levels. @@ -135,7 +135,7 @@ \section*{Introduction} % The right panel shows FADE simulations with increasing values for the level uncertainty, which do not converge for high noise levels, that is, the manipulations \emph{cannot} be compensated by amplification as one would expect from a class D loss. % -An important observation of \cite{kollmeier2015} was, that their empirical data set, which included matrix sentence test results in noise of almost 200 ears, could not be satisfactorily predicted with the class A loss only, indicating that an implementation of mechanism that induces a class D loss is needed to explain the speech recognition performance of individual listeners. +An important observation of \cite{kollmeier2015} was, that their empirical data set, which included matrix sentence test results in noise of almost 200 ears, could not be satisfactorily predicted with the class A loss alone, indicating that an implementation of a mechanism that induces a class D loss is needed to explain the speech recognition performance of individual listeners. \cite{schaedler2020a} extended that approach by inferring the individual frequency-dependent level uncertainty from tone in noise detection thresholds, and achieved unprecedented accuracy in the prediction of benefits in SRTs due to different traditional hearing loss compensation schemes in noise (and in quiet). 
% @@ -156,7 +156,7 @@ \section*{Introduction} The aim of this contribution is to: \begin{itemize} \item[A)] Present an approach which is able to partially compensate a class D loss as implemented with the level uncertainty in FADE, and - \item[B)] Objectively evaluate this approach and come up with testable quantitative hypothesis on the benefit in noisy listening conditions. + \item[B)] objectively evaluate this approach and come up with testable quantitative hypotheses on the benefit in noisy listening conditions. \end{itemize} For an effective mitigation of the effect of the level uncertainty on speech recognition performance, three main problems need to be addressed: @@ -170,13 +170,13 @@ \section*{Introduction} % Let us assume that a log Mel-spectrogram, such as it used for the calculation of the widely used Mel-frequency cepstral coefficient (MFCC) features, is representative for the information that is available to an ASR system. % -The spectral resolution of such a log Mel-spectrogram, of which an Example is depicted in the upper panel of Figure~\ref{fig:1}, is about 1 ERB, the temporal resolution is 10\,ms. +The spectral resolution of such a log Mel-spectrogram, of which an example is depicted in the upper panel of Figure~\ref{fig:1}, is about 1 equivalent rectangular bandwidth (ERB), the temporal resolution is 10\,ms. % The relevant speech information is encoded in the represented spectro-temporal dynamic, that is, the differences of spectro-temporal signal levels over time, called temporal modulations, and over frequency, called spectral modulations. % It is remarkable that ASR systems traditionally don't even use the whole information in the log Mel-spectrogram, but work with a reduced spectral resolution compared to the spectral resolution of the human auditory system.
% -For example, \emph{the} standard features used for ASR, MFCCs \citep{etsi2007}, specifically encode spectral modulation frequencies from $0$ to about $\frac{6}{23}$ cycles per ERB; spectral modulation frequencies above $\frac{1}{4}$ cycles per ERB empirically don't contribute to speech recognition performance. +For example, \emph{the} standard features used for ASR, MFCCs \citep{etsi2007}, specifically encode spectral modulation frequencies from $0$ to about $\frac{6}{23}$ cycles per ERB; spectral modulation frequencies above $\frac{1}{4}$ cycles per ERB empirically don't contribute to automatic speech recognition performance. % In line with this finding, the robust Gabor filter bank (GBFB) features use only slightly more than half of the available spectral resolution of the log Mel-spectrograms \cite[c.f.][]{schaedler2012}. % @@ -208,11 +208,11 @@ \section*{Introduction} % In the logarithmic level domain (in the log Mel-spectrogram) the noise of the level uncertainty is additive, and one effective way of mitigating the effect of an additive noise is amplification. % -This means, that the desired spectral modulation patterns should be amplified, that is, expanded. +This means, that the desired spectral modulation patterns should be amplified, that is, expanded before the noise of the level uncertainty can remove that information. At a first glance, an expansion of signal dynamic in the context of the reduced residual dynamic range of listeners with impaired hearing might seem completely undesirable: % -The expansion of spectral modulations can result in uncomfortably high and/or inaudibly soft signal levels; which an occur even at the same time for different frequency ranges. +The expansion of spectral modulations can result in uncomfortably high and/or inaudibly soft signal levels; which can occur at the same time in different frequency ranges. 
% However, considering the scale (2 to 4 ERB) it is clear, that this is a region which is not modified by common approaches to multi-band dynamic compression, where the signal is usually independently compressed in approximately six bands. % @@ -224,7 +224,7 @@ \section*{Introduction} % The dynamic range of speech in noise is less than the dynamic range of a clean speech signal. % -Especially when considering speech in only sightly fluctuating noises at signal-to-noise ratios (SNRs) around 0\,dB, the dynamic range of the mixture can be close to the minimum which is required to discriminate words. +Especially when considering speech in only slightly fluctuating noises at signal-to-noise ratios (SNRs) around 0\,dB, the dynamic range of the mixture can be close to the minimum which is required to discriminate words. % In this condition, no improvement can be expected from compressing the signal dynamic. % @@ -244,15 +244,15 @@ \section*{Introduction} % Because of the highly non-linear nature of the interactions between the factors influencing speech recognition performance (speech material, masker type and level, reverberation, non-linear signal processing, hearing impairment), an implementation that already fulfills the most basic requirements of a hearing aid algorithm and can run on a hearing aid prototype\footnote{\url{https://github.com/m-r-s/hearingaid-prototype}} was preferred over a simple proof-of-concept implementation. % -This additional effort makes a seamless translation to an application in a hearing device more likely and increases the meaningfulness of the presented results for a possible final hearing device. - +This additional effort makes a seamless translation to an application in a hearing device more likely and increases the meaningfulness of the presented results for a possibly realizable hearing aid solution. 
+ For an evaluation of the implementation with respect to a possible compensation of a class D loss, the following points need to be considered: \begin{itemize} \item[1)] Which listening conditions, that is, which speech test and maskers, are suitable to evaluate the PLATT implementation objectively with FADE \emph{and} (also later) empirically. \item[2)] Which listener profiles are suited to clearly demonstrate a (partial) compensation of a class D loss like it is implemented in FADE. \end{itemize} -The first point is important to enable the verification of any hypotheses that are based on the model predictions. +The first point is important to enable the verification with empirical data of any hypotheses that are based on the model predictions. % \cite{schaedler2020a} discussed this point and proposed to use the SRT-50 measured with the matrix sentence test in quiet, in a stationary, and in a fluctuating noise condition, to cover the very different masker properties in typical listening conditions: Quiet, low-dynamic maskers, high-dynamic masker. % @@ -262,7 +262,7 @@ \section*{Introduction} % That means, low measurement errors in the empirical data may facilitate or even enable falsification of the model predictions. % -An SRT of 0\,dB at high noise levels in the test-specific noise condition, this is with a stationary noise of identical long term spectrum than the speech signal, can be considered very problematic when normal-hearing listeners can achieve about -8\,dB. +An SRT of 0\,dB at high noise levels in the test-specific noise condition, that is, with a stationary noise that has the same long-term spectrum as the speech signal, can be considered very problematic when normal-hearing listeners can achieve about -8\,dB. % If only half of the hearing loss in that condition in noise could be compensated (which would be a huge achievement), the measurable benefit would be only 4\,dB.
% @@ -270,35 +270,37 @@ \section*{Introduction} % Such low measurement errors, which can be achieved in SRT measurements with the matrix sentence test, would later enable to show individual benefits without averaging over groups of listeners, given the benefit was 4\,dB. % -When adding to this consideration the need for a level-dependent evaluation required to identify the class D loss according to \cite{plomp1978}, one arrives at the test conditions which were already used in \cite{kollmeier2015} and that are depicted in Figure~\ref{fig:2}. +When adding to this consideration the need for a level-dependent evaluation, which is required to identify the class D loss according to \cite{plomp1978}, one arrives at the test conditions which were already studied in \cite{kollmeier2015} and that are depicted in Figure~\ref{fig:2}. The second point, the selection of suitable listener profiles, is a bit more complex than it might initially appear. % A sensible approach would be to take the individual profiles inferred from the psychoacoustic measurements by \cite{schaedler2020a} which are available online\footnote{\url{https://doi.org/10.5281/zenodo.4394186}}. % -The main problem with that approach is, that even with a small purely class A loss, the SRTs are generally not level-independent at high levels. +The main problem with this approach is, that even with a small pure class A loss, the SRTs are generally not level-independent at high levels. % This can already be observed for the fluctuating noise condition in the left panel of Figure~\ref{fig:2}. % -There, the simulations with a purely class A loss with hearing thresholds according to the standard profile \emph{N1} (corresponding to a very mild hearing loss) do not converge to the data with the normal-hearing profile \emph{None} up to noise levels of 90(!)\,dB SPL. 
+There, the simulations with a pure class A loss with hearing thresholds according to the standard profile \emph{N1} (corresponding to a very mild hearing loss) do not converge with the data of the normal-hearing profile \emph{None} up to noise levels of 90\,dB SPL. % Hence, even for very small increases in hearing threshold, amplification improves the SRT in the fluctuating noise condition up to very high presentation levels. % With the aim of clearly attributing compensation strategies to compensate a class A or a class D loss, this is highly undesirable. % -To clearly identify the compensation of a class D loss, linear amplification alone must \emph{not} improve the SRT. +To clearly identify the compensation of a class D loss, simple linear amplification alone must not improve the SRT. % -The reason for the found behavior is that the frequency range above the hearing threshold increases with level for the Bisgaard profiles. +The reason for the observed model behavior is that, for the Bisgaard profiles, the frequency range above the hearing threshold increases with the presentation level. % -While the limited frequency range can safely be assumed a factor contributing to a class D loss and has to be considered in a suitable listener profile, it must be avoided that the used frequency range still changes at high presentation levels. +While the limited frequency range can safely be assumed a factor contributing to a class D loss and has to be considered in a suitable listener profile, it must be avoided that the effectively used frequency range changes at high presentation levels. % -One option would be to low-pass filter the speech material, another option is to define profiles with very steep hearing loss functions. +One option would be to low-pass filter the speech material. % -The former would be very suitable for individual predictions for which empirical data can be measured. 
+Another option is to define profiles with very steeply sloping hearing loss functions. +% +The former option would be very suitable to measure empirical data. % The latter option is regarded cleaner from a modeling perspective, because it reduces the number of parameters that influence the SRT and results in a simpler and possibly better traceable model. % -Hence, listener profiles with normal hearing thresholds below and \emph{infinite} hearing loss above a given limit frequency are suitable for the considerations in this contribution. +Hence, listener profiles with normal hearing thresholds below, and infinite hearing loss above a given limit frequency are suitable for the considerations in this contribution. % While the measurements from \cite{schaedler2020a} indicate that the level uncertainty might be frequency-dependent, profiles with frequency-dependent level uncertainty would add an additional dimension to a already complex matter. % @@ -306,7 +308,7 @@ \section*{Introduction} % Hence, for the purpose of demonstrating the effectiveness of the compensation approach with PLATT and to discover its interactions and limits, idealized profiles that can be described by only two parameters, the frequency limit and (frequency-independent) level uncertainty, are preferable. % -In later measurements with human listeners, the limited frequency range can still be achieved by low-pass filtering the signals to match the considered conditions. +In later measurements with human listeners, the limited frequency range can be achieved by low-pass filtering the signals to match the considered conditions.
With these considerations the following steps are enabled: % @@ -322,12 +324,12 @@ \section*{Introduction} %% METHODS \section*{Methods} \label{sec:methods} -The methods described in the following were used to simulate speech recognition experiments in stationary and fluctuating noise at different presentation levels for 16 listening profiles with class D hearing losses without and with the proposed dynamic range expansion by PLATT including different degrees of expansion. +The methods described in the following were used to simulate speech recognition experiments in stationary and fluctuating noise at different presentation levels for 16 listening profiles with class D hearing losses without and with the later proposed dynamic range expansion by PLATT including different degrees of expansion. \subsection*{Speech recognition tests} \label{sec:matrixtests} % -The speech material of the (male) German matrix sentence test \citep{wagener1999,kollmeier2015} was used with two masker signals: The test-specific noise (called OLNOISE) and the fluctuating ICRA5-250 \citep{dreschler2001,wagener2006} noise signal. +The speech material of the (male) German matrix sentence test \citep{wagener1999,kollmeier2015} was used with two masker signals: The test-specific noise (called OLNOISE) and the fluctuating ICRA5-250 noise signal \citep{dreschler2001,wagener2006}. % The matrix test, which exists in more than 20 languages, comprises 50 phonetically balanced common words of which sentences with a fixed syntax, such as \enquote{Peter got four large rings} or \enquote{Nina wants seven heavy tables}, are built. % @@ -341,15 +343,15 @@ \subsection*{Speech recognition tests} % At the SRT for normal hearing listeners, -7\,dB \citep{hochmuth2015}, this results in a noisy speech signal with a low dynamic range, where spectro-temporal maxima of the mixtures are dominated by the speech signal. 
% -The effect can be observed in Figure~\ref{fig:3}, where the log Mel-spectrogram of clean speech signal (upper panel) and the same speech signal with the OLNOISE masker at -7\,dB SNR (center panel) are depicted.oc +The effect can be observed in Figure~\ref{fig:3}, where the log Mel-spectrogram of clean speech signal (upper panel) and the same speech signal with the OLNOISE masker at -7\,dB SNR (center panel) are depicted. % -\begin{figure*}[h] +\begin{figure*} \centerline{\includegraphics[width=1.0\textwidth]{images/speech-noise}} \caption{Log Mel-spectrograms of a clean German matrix sentence at 65\,dB SPL (upper panel), of the same sentence in the stationary noise (center panel) and fluctuating noise (lower panel) conditions at the SNRs which correspond to the SRT listeners with normal hearing of speech, -7 and -19\,dB, respectively.} \label{fig:3} \end{figure*} % -The ICRA5-250 noise is a speech-shaped noise which is modulated with speech patterns in three bands, where the pause duration was limited to 250\,ms \citep{wagener2006}. +The ICRA5-250 noise is a speech-shaped noise which is co-modulated with speech-like temporal patterns in three independent frequency bands, where the pause duration was limited to 250\,ms \citep{wagener2006}. % The empirical SRTs with this masker signal are usually more than 10\,dB lower than in the corresponding test-specific noise condition \citep{hochmuth2015}. % @@ -357,19 +359,19 @@ \subsection*{Speech recognition tests} % This can be observed in the lower panel of Figure~\ref{fig:3}, where the log Mel-spectrogram of a speech signal with the ICRA5-250 masker at -19\,dB SNR is depicted in the lower panel. % -Both maskers are considered at presentation levels from 0 to 100\,dB SPL in 10-dB steps, to assess the level dependency. +To assess the presentation level dependency, both maskers are considered at presentation levels from 0 to 100\,dB SPL in 10-dB steps. 
% At 0\,dB SPL presentation level, this corresponds to listening in quiet. % The selected listening conditions reflect important dimensions of speech perception: Listening in quiet, at low levels, and at high levels, as well as listening in stationary and fluctuating noise. % -All considered speech tests could be performed human listeners. +All considered speech tests can also be performed with human listeners. %% FADE \subsection*{Simulations of matrix tests with FADE} \label{sec:fade} % -The speech tests considered in Section~\nameref{sec:matrixtests} were simulated with an ASR-based approach and their outcome, the SRT-50, was predicted. +The speech tests considered in Section~\nameref{sec:matrixtests} were simulated with an ASR-based approach and their outcome, the SRT-50, was predicted based on the simulation results. % The simulations were performed with the latest standard version of FADE\footnote{\url{https://doi.org/10.5281/zenodo.4003779}}, as described by \cite{schaedler2016a}. % @@ -377,7 +379,7 @@ \subsection*{Simulations of matrix tests with FADE} % Hence, the FADE simulation method is only outlined here, and we refer the interested reader to the original description from \cite{schaedler2016a}. -Predictions with FADE are performed completely independently for each listening condition (masker, maskerlevel, hearing loss compensation, hearing profile). +Predictions with FADE are performed completely independently for each listening condition (masker, masker level, hearing loss compensation, and hearing profile). % There is no dependency on any empirically measured SRT, nor on predictions of the same model in other/reference conditions (and hence no need to define such).
% @@ -389,19 +391,19 @@ \subsection*{Simulations of matrix tests with FADE} % For this, a corpus of noisy speech material at different SNRs was generated from the clean matrix sentence test material and the masker signal, by adding randomly chosen masker signal fragments with the speech material. % -The noisy signal were processed with PLATT when an aided listening condition was considered. +The noisy signals were processed with PLATT when an aided listening condition was considered. % -From the noisy (processed) speech signals, features were extracted, where this step included the implementation of the class D hearing loss, as described in Section~\nameref{sec:listenerprofiles}. +From the noisy (and optionally processed) speech signals, features were extracted, where this step included the implementation of the class D hearing loss, as described in Section~\nameref{sec:listenerprofiles}. % Subsequently, an ASR system using whole-word models implemented with Gaussian Mixture Models and Hidden Markov Models, was trained on the features. % This resulted in 50 whole-word models for each training SNR. % -These models were then used with a language model that considers only valid matrix sentences ( $10^5$ possible matrix sentences) to recognize test sentences on a broad range of SNRs with noisy speech material form the same considered condition. +These models were then used with a language model that considers only valid matrix sentences (of which $10^5$ exist) to recognize test sentences on a broad range of SNRs with noisy speech material from the same considered condition. % For each combination of a training SNR and a test SNR, the transcriptions of the test sentences were evaluated in terms of the percentage of correctly recognized words. % -The resulting recognition result map (cf.
Figure~7 in \cite{schaedler2020a} for an example), which contained the speech recognition performance of the ASR system depending on the training and testing SNRs in 3\,dB steps, was queried for the SRT. +The resulting recognition result map (cf. left panel in Figure~7 in \cite{schaedler2016a} for an example), which contained the speech recognition performance of the ASR system depending on the training and testing SNRs in 3\,dB steps, was queried for the SRT. % For a given target recognition range, e.g. 50\%, the lowest SNR at which this performance was achieved was interpolated from the data in the recognition result map and reported as the predicted SRT for the considered condition. % @@ -411,30 +413,32 @@ \subsection*{Simulations of matrix tests with FADE} \subsection*{Listener profiles: Class D hearing losses} \label{sec:listenerprofiles} % -Predictions of speech recognition tests as well as basic psychoacoustic tests with FADE were found to be close to the empirical results for listeners with normal hearing. +Outcome predictions of speech recognition tests as well as basic psychoacoustic tests with FADE were found to be close to the empirical results for listeners with normal hearing \citep{schaedler2016a}. % As proposed by \cite{kollmeier2016} and successfully used by \cite{schaedler2020a}, impaired hearing was implemented in the ASR system by removing the information from the feature vectors that is presumably not available to listeners with impaired hearing. % -As discussed in the \nameref{sec:intoduction}, two types of manipulations which induces class D hearing loss were considered: 1) A limitation of the frequency range, 2) An increase of level uncertainty. +As discussed in the \nameref{sec:intoduction}, two types of manipulations which induce class D hearing loss were considered: 1) A limitation of the frequency range, 2) An increase of level uncertainty. 
% The effect of both parameters on the log Mel-spectrogram of a clean speech sample is depicted in Figure~\ref{fig:4}, where in the upper panel, the frequency range was limited to 8000\,Hz and the level uncertainty was 1\,dB. % \begin{figure*}[h] \centerline{\includegraphics[width=1.0\textwidth]{images/leveluncertainty-bandwidth}} - \caption{Illustration of the class-D-loss-inducing log Mel-spectrogram manipulations: Log Mel-spectrograms for listener profiles \enquote{P-8000-1} (upper panel), \enquote{P-8000-7} (center panel), and \enquote{P-2000-7} (lower panel). + \caption{Illustration of the two considered class-D-loss-inducing log Mel-spectrogram manipulations: Log Mel-spectrograms for listener profiles \enquote{P-8000-1} (upper panel), \enquote{P-8000-7} (center panel), and \enquote{P-2000-7} (lower panel). The first number in the profile encodes the upper frequency limit, in this example 8000 and 2000\,Hz, and the second number indicates the level uncertainy, here 1 and 7\,dB.} \label{fig:4} \end{figure*} % In the center panel, the level uncertainty was increased to 7\,dB, compared to the upper panel. % -And in the lower panel, the frequency range was limited to 2000\,Hz compared to the center panel. +In the lower panel, the frequency range was additionally limited to 2000\,Hz compared to the center panel. % -As an amplification of input signal would increase all values in the the shown log Mel-spectrograms by the same value, both manipulations introduce a level-independent loss of information and hence induce a class D loss. +An amplification of the input signal increases all values in a log Mel-spectrogram by a constant value. +% +Both manipulations introduce a level-independent loss of information and hence induce a class D loss. % For the evaluation, upper frequency limits of 1000, 2000, 4000, and 8000\,Hz were considered, where the class D loss decreases with high values.
% -And frequency-independent values of 1, 7, 14, and 21\,dB for the level uncertainty were considered, where the class D loss increases with high values. +For the level uncertainty, frequency-independent values of 1, 7, 14, and 21\,dB were considered, where the class D loss increases with high values. % All combinations of both parameters result in 16 profiles from \enquote{P-8000-1} to \enquote{P-1000-21}. % @@ -446,11 +450,11 @@ \subsection*{PLATT dynamic range manipulation} % In this section, the patented (DE 10 2017 216 972) PLATT dynamic range manipulation as it was conceived for a later implementation in a hearing device is described. % -The implementation was optimized to run in real-time on a Raspberry Pi 3B to enable field studies with mobile hearing aid prototype hardware\footnote{For example: \url{https://github.com/m-r-s/hearingaid-prototype}\\ or \url{https://batandcat.com/portable-hearing-laboratory-phl.html}}. +The implementation was optimized to run in real-time on a Raspberry Pi 3 Model B to enable field studies with mobile hearing aid prototype hardware\footnote{For example: \url{https://github.com/m-r-s/hearingaid-prototype}\\ or \url{https://batandcat.com/portable-hearing-laboratory-phl.html}}. % The ability to expand spectral modulation frequencies in the range of $\frac{1}{8}$ to $\frac{1}{4}\frac{\text{cycles}}{\text{ERB}}$ is a feature that integrates naturally with the approach. % -Even if not strictly necessary for the goals of this contribution, the method is described here in detail to make statements about its ability to compensate a class D loss in the algorithmic context in which it might be later used in a hearing device. +Even if not strictly necessary for the goals of this contribution, the method is described here in detail to make statements about its ability to compensate a class D loss in the algorithmic context in which it might be later usable in a hearing device. 
% To motivate the design decisions behind PLATT, which generally aims to preserve relevant speech modulations when compressing the dynamic range of a signal, this subsection comes with its own introductory part. @@ -458,13 +462,13 @@ \subsection*{PLATT dynamic range manipulation} % Conditions in which the available dynamic range for acoustic communication is reduced are rather the norm than the exception. % -For example, in a driving car, the lower limit of the available dynamic range is given by the driving noise. +For example, in a driving car, the lower limit of the available dynamic range is determined by the driving noise. % Or in a library, the upper limit is given by the accepted sound levels in such an environment. % And, importantly, the available dynamic range for communication is limited for listeners with impaired hearing. % -For successful communication, it may be required to adapt a source signal, which may contain speech and non-speech parts, to the available dynamic range on the receiver side by dynamic range compression. +For a successful communication, it may be required to adapt a source signal, which may contain speech and non-speech parts, to the available dynamic range on the receiver side by dynamic range compression. % But, in many real-time applications, the available temporal context to perform this operation is very limited. @@ -472,14 +476,15 @@ \subsection*{PLATT dynamic range manipulation} % The compression is often applied with rather short attack time constants, e.g., 20\,ms, with the aim to protect the user from high levels, while the release time constants are usually much longer, e.g. 100\,ms to 1000\,ms, with the aim to limit compression when it not desirable, i.e. during short speech pauses. % -However, no distinction is made whether the signal contains speech or not. +However, no distinction is made whether the signal contains speech portions or not. 
% Approaches which depend on a classification whether or not speech is present in the input signal, are prone to errors if the (speech-)signal-to-noise ratio (SNR) is low, that is, just when the classification result is most important. % Approaches that require more than a few milliseconds of future temporal context cannot be used in applications which require low latency, such as, e.g., hearing devices. % -Regarding the speech intelligibility of processed signals, compression in a few wide frequency bands is preferred over compression in many narrow frequency bands, however, the recommended number of channels greatly varies (usually between 1 and 8) \citep{plomp1988,dreschler1992,hohmann1995,yund1995,moore1999,souza2002}; -The fewer channels are used, the better the spectral dynamic, i.e., the spectral contrast or \emph{spectral modulation}, is preserved. +Regarding the speech intelligibility of processed signals, compression in a few wide frequency bands is preferred over compression in many narrow frequency bands, however, the recommended number of channels greatly varies (usually between 1 and 8) \citep{plomp1988,dreschler1992,hohmann1995,yund1995,moore1999,souza2002}. +% +The fewer channels are used, the better the spectral dynamic, that is, the spectral contrast or \emph{spectral modulation}, is preserved. % Static dynamic range compression only preserves the spectral modulation within each independent frequency band, but not across bands, even if the dynamic range would be available. % @@ -493,13 +498,13 @@ \subsection*{PLATT dynamic range manipulation} % However, the often employed basis for the feature extraction stages, the log Mel-spectrogram, is not suited for low-latency signal processing due to its long integration window. 
% -The relatively long integration window of the log Mel-spectrogram serves two objectives: 1) Obtain a sufficiently high frequency resolution to separate low-frequency signal content into 1\,ERB-wide bands, and 2) Ensure that in voiced speech portions each signal frame contains at least one pulse (that is to remove the temporal fine-structure of the speech signal). +The relatively long integration window of the log Mel-spectrogram serves two objectives: 1) Obtain a sufficiently high frequency resolution to separate low-frequency signal content into approximately 1\,ERB-wide bands, and 2) Ensure that in voiced speech portions each signal frame contains at least one pulse (that is to remove the temporal fine-structure of the speech signal). % -Fortunately, these two aspects, sufficient spectral resolution for low frequencies and limited temporal resolution, are compatible can be optimized for low-latency processing at the cost of a frequency-dependent group delay. +Fortunately, these two aspects (sufficient spectral resolution for low frequencies and limited temporal resolution) are compatible and can be optimized for low-latency processing at the cost of a frequency-dependent group delay. 
% In the following, the design of PLATT, a fast adaptive dynamic range manipulation scheme that takes the mentioned observations into account, is proposed, where the following three objectives were pursued: \begin{itemize} - \item Preservation and enhancement of spectral modulations which relevant for speech recognition + \item Preservation and enhancement of spectral modulations which are assumed to be relevant for speech recognition \item Low-latency and fast reaction time while minimizing audible artifacts \item Adaptive limitation of the compression to the necessary minimum \end{itemize} @@ -513,6 +518,18 @@ \subsection*{PLATT dynamic range manipulation} \item[3)] Adaptive calculation of frequency- and time-dependent gains from the spectro-temporal representation which uses compression only as required to provide high speech recognition performance when the available dynamic range on the output side is limited. \end{itemize} +Figure~\ref{fig:5} illustrates the relations between the signal processing blocks that were used to implement this functionality. +% +\begin{figure*} + \centerline{\includegraphics[width=1.0\textwidth]{images/platt-diagramm}} + \caption{Diagram illustrating the relations of the main signal processing blocks which were used to implement the signal analysis, manipulation, and re-synthesis with PLATT.} + \label{fig:5} +\end{figure*} +% +A detailed description is provided in the following. +% +The exact implementation details are provided in a reference implementation that is written in C (cf. Section~\nameref{sec:ressources}). + \paragraph{Frequency decomposition \& re-synthesis} % The frequency decomposition of the input signal is performed with a filter bank of fourth-order Gammatone filters. 
@@ -525,16 +542,16 @@ \subsection*{PLATT dynamic range manipulation} % The -10\,dB-bandwidth of each fourth-order Gammatone filter is chosen to be equal to the difference of the frequencies two positions right and left to its center frequency, e.g., $221\,\text{Hz}-93\,\text{Hz}=128\,\text{Hz}$ for 155\,Hz. % -The aim is to evenly cover the relevant frequency range with filters that have a bandwidth similar to auditory filters ($\approx1\,\text{ERB}$) and allow a trivial re-synthesis in the time domain by summation of the filter bank outputs. +The aim is to evenly cover the relevant frequency range with filters that have a bandwidth similar to auditory filters ($\approx1\,\text{ERB}$) and allow a trivial re-synthesis in the time domain by simple summation of all filter bank outputs. % With this goal, the filter coefficients are determined as follows: % -The pole in the complex z-plane that describes the frequency-dependent properties of a first-order infinite impulse response Gammatone filter is calculated according to the empirically determined formula +The pole in the complex z-plane that describes the frequency-dependent properties of a first-order infinite impulse response Gammatone filter is calculated according to the formula \begin{equation} p = \left(1-\frac{1}{\sqrt{2} \cdot \frac{10000}{\text{bw}} + 0.5}\right) \cdot \exp\left(2\pi \text{i} \frac{\text{f}_\text{c}}{\text{f}_\text{s}}\right), \end{equation} % -where bw is the -10\,dB-bandwidth and $\text{f}_\text{c}$ the center frequency of the corresponding fourth-order filter, and $\text{f}_\text{s}$ the sampling frequency in Hz. +where bw is the -10\,dB-bandwidth in Hz, $\text{f}_\text{c}$ the center frequency of the corresponding fourth-order filter, and $\text{f}_\text{s}$ the sampling frequency. 
% The phase of the single FIR coefficient of each filter is chosen such that the phases of each pair of fourth-order filters with neighboring center frequencies were identical at the delay where the product of their respective temporal envelopes reaches its maximum. % @@ -544,24 +561,24 @@ \subsection*{PLATT dynamic range manipulation} % Together, a) evenly covering frequencies, b) avoiding destructive interference, and c) normalizing the maximum gain result in a very flat frequency response for the sum of all filter bank channels. % -Figure~\ref{fig:5} shows the first 11\,ms of the real part of the impulse responses of a subset of the Gammatone filters and also the (scaled) sum over the real parts of all impulse responses. +Figure~\ref{fig:6} shows the first 11\,ms of the real part of the impulse responses of a subset of the Gammatone filters and also the (scaled) sum over the real parts of all impulse responses. % -\begin{figure}[h] +\begin{figure} \centerline{\includegraphics[width=.85\columnwidth]{images/gammatone_filter_responses}} \caption{Real part of the impulse responses of a subset of the normalized, phase-adjusted, fourth-order Gammatone filters that were employed for the frequency decomposition, and the (scaled) sum of the impulse responses of all employed Gammatone filters.} - \label{fig:5} + \label{fig:6} \end{figure} % The joint impulse response (sum of the real-valued impulse responses of all normalized, phase-adjusted, forth-order Gammatone filters) is a downward frequency-sweep. % -The frequency-dependent delay can be read from Figure~\ref{fig:5} and is about $2.5\,\text{ms}$ at $2\,\text{kHz}$ and about $4.5\,\text{ms}$ at $800\,\text{Hz}$. +The frequency-dependent delay can be read from Figure~\ref{fig:6} and is about $2.5\,\text{ms}$ at $2\,\text{kHz}$ and about $4.5\,\text{ms}$ at $800\,\text{Hz}$. 
% -Figure~\ref{fig:6} shows the corresponding absolute values of the transfer functions of the same sub-set of filters and the absolute values of the transfer function corresponding to the joint impulse response of all filters. +Figure~\ref{fig:7} shows the corresponding absolute values of the transfer functions of the same sub-set of filters and the absolute values of the transfer function corresponding to the joint impulse response of all filters. % -\begin{figure}[h] +\begin{figure} \centerline{\includegraphics[width=.65\columnwidth]{images/gammatone_filter_transfer}} - \caption{Absolute values of the transfer functions corresponding to the impulse responses shown in Figure~\ref{fig:5} and the absolute value of the joint transfer function of all filters (including those not shown).} - \label{fig:6} + \caption{Absolute values of the transfer functions corresponding to the impulse responses shown in Figure~\ref{fig:6} and the absolute value of the joint transfer function of all filters (including those not shown).} + \label{fig:7} \end{figure} % The joint transfer function of all 78 employed Gammatone-filters, which characterizes the system property after re-synthesis if the amplitudes are not manipulated, has a flat frequency response between about $150\,\text{Hz}$ and $13\,\text{kHz}$. @@ -572,7 +589,7 @@ \subsection*{PLATT dynamic range manipulation} % The filter bank output can be manipulated directly, e.g., multiplied with a time- and frequency-dependent gain function, prior to the re-synthesis. % -The rate of change of the gain functions is limited to 24\,dB per period of the corresponding center-frequency to limit channel cross talk. +The rate of change of the gain functions is limited to 24\,dB per period of the corresponding center frequency to limit channel crosstalk. 
\paragraph{Spectro-temporal signal representation} % @@ -582,7 +599,7 @@ \subsection*{PLATT dynamic range manipulation} % This approach approximately extracts the temporal envelope of each channel while preserving fast increases in amplitude (on-sets). % -Hence, the exact timing (or temporal fine structure) is removed from this representation, and only the local maximum values remain as an estimate of the maximum amplitude (or displacement) of a oscillatory system with properties similar to those of the employed Gammatone filters. +Hence, the exact timing (or temporal fine structure) is removed from this representation, and only the local maximum values remain as an estimate of the maximum amplitude (or displacement) of an oscillatory system with properties similar to those of the employed Gammatone filters. % This representation can be down-sampled by any factor which reduces the sample rate to $\frac{1}{15\,\text{ms}}\approx67\,\text{Hz}$ or higher, without missing any local maximum value. % @@ -592,12 +609,12 @@ \subsection*{PLATT dynamic range manipulation} % The use in a hearing device, however, requires faster updates which is why the representation is down-sampled to $1000\,\text{Hz}$, that is, an update of the 78 spectral values is calculated every $1\,\text{ms}$. % -In Figure~\ref{fig:7}, the spectro-temporal representation used in PLATT and the log Mel-spectrogram of a clean speech sample at 65\,dB SPL are shown. +In Figure~\ref{fig:8}, the spectro-temporal representation used in PLATT and the log Mel-spectrogram of a clean speech sample at 65\,dB SPL are shown. 
% -\begin{figure*}[h] +\begin{figure*} \centerline{\includegraphics[width=1.0\textwidth]{images/logms-plattstr}} \caption{Comparison of spectro-temporal representations: In the upper panel a log Mel-spectrogram as it is used in FADE of a clean speech sample at 65dB SPL, and in the lower panel the presented specto-temporal representation that is used in PLATT of the same sentence.} - \label{fig:7} + \label{fig:8} \end{figure*} % Compared to the log Mel-spectrogram, the proposed spectro-temporal representation has a 10-times higher temporal resolution (visible at the on-sets), where, however, the temporal fine structure is effectively removed. @@ -614,33 +631,35 @@ \subsection*{PLATT dynamic range manipulation} % This effectively removes spectro-temporal modulations that cannot be perceived by listeners with normal hearing from the proposed spectro-temporal signal representation. % -The effect can be observed in the high frequency range ($>8000$) in Figure~\ref{fig:7}, where the speech signal has no energy. +The effect can be observed in the high frequency range ($>8000$\,Hz) in Figure~\ref{fig:8}, where the speech signal has no energy. \paragraph{Adaptive spectral gain} % -The adaptive determination of time- and frequency-dependent gains takes into account the current spectral input dynamic, the current available output dynamic, and aims to minimize the compression with the constraint to avoid the masking signal parts which could carry important (speech) information. +The adaptive determination of time- and frequency-dependent gains takes into account the current spectral input dynamic and the currently available output dynamic. +% +It aims to minimize the compression with the constraint to avoid masking the signal parts which could carry important (speech) information. 
% It also allows to expand the spectral modulations that are assumed to be important for speech recognition and trade the such increased signal dynamic against an increased compression of less relevant signal dynamic. The spectral input dynamic is analyzed with spectral modulation low-pass filters. % -For this, each vector of the $78$ spectral values is convolved with Hanning windows of the following width to obtain spectrally smoothed versions of the initial vector: $8$, $16$, $32$, and $64$, which approximately correspond to a full width at half maximum (FWHM) of $2$, $4$, $8$, and $16\,\text{ERB}$, respectively. +For this, each vector of the $78$ spectral values is convolved with Hanning windows of the following widths to obtain increasingly spectrally smoothed versions of the initial vector: $8$, $16$, $32$, and $64$, which approximately correspond to a full width at half maximum (FWHM) of $2$, $4$, $8$, and $16\,\text{ERB}$, respectively. % -The left panel of Figure~\ref{fig:8} shows an example of the spectral analysis for a signal which consists of two pure tones, of $500\,\text{Hz}$ and $2000\,\text{Hz}$. +The left panel of Figure~\ref{fig:9} shows an example of the spectral analysis for a signal which consists of two pure tones, of $500\,\text{Hz}$ and $2000\,\text{Hz}$. % -\begin{figure*}[h] +\begin{figure*} \centerline{\includegraphics[width=.80\textwidth]{images/details-dynamic-layers}} \caption{Example dynamic mapping of two pure tones. Left panel: Input dynamic analysis with spectral modulation low-pass filters. Center panel: (Conditional) reconstruction with reduced dynamic. Right Panel: Gains required to map the input dynamic to the reconstruction stages of the output dynamic (final gains are black). Numbers indicate the differences.} - \label{fig:8} + \label{fig:9} \end{figure*} % The input spectral values are depicted in black, the smoothed versions with ascending widths of the Hanning window in increasingly lighter shades of gray. 
% The differences between the curves of increasingly smoothed spectral representations are indicated with the digits 1 to 4. % -Because the difference of two linear low-pass filters is a band-pass filter, the differences 1, 2, 3, and 4 can be interpreted as the result of a spectral modulation band-pass filtering which roughly contains the respective spectral modulation frequencies 1) above $\frac{1}{4}$, 2) from $\frac{1}{4}$ to $\frac{1}{8}$, 3) from $\frac{1}{8}$ to $\frac{1}{16}$, and 4) from $\frac{1}{16}$ to $\frac{1}{32}$\,$\frac{\text{cycles}}{\text{ERB}}$, respectively. +Because the difference of two low-pass filters is a band-pass filter, the differences 1, 2, 3, and 4 can be interpreted as the result of a spectral modulation band-pass filtering which roughly contains the respective spectral modulation frequencies 1) above $\frac{1}{4}$, 2) from $\frac{1}{4}$ to $\frac{1}{8}$, 3) from $\frac{1}{8}$ to $\frac{1}{16}$, and 4) from $\frac{1}{16}$ to $\frac{1}{32}$\,$\frac{\text{cycles}}{\text{ERB}}$, respectively. % -By definition, the original vector of spectral values (cf. black line in left panel of Figure~\ref{fig:8}) can be recovered by adding the differences 1 to 4 to the low-pass filtered spectral values which contain the spectral modulation frequencies below $\frac{1}{16}$\,$\frac{\text{cycles}}{\text{ERB}}$ (cf. lightest line in left panel of Figure~\ref{fig:8}). +By definition, the original vector of spectral values (cf. black line in left panel of Figure~\ref{fig:9}) can be recovered by adding the differences 1 to 4 to the low-pass filtered spectral values which contain the spectral modulation frequencies below $\frac{1}{16}$\,$\frac{\text{cycles}}{\text{ERB}}$ (cf. lightest line in left panel of Figure~\ref{fig:9}). % In the following, the vector of low-pass filtered spectral values which contains the lowest modulation frequencies is referred to as the \emph{base layer}. 
% @@ -656,11 +675,11 @@ \subsection*{PLATT dynamic range manipulation} % Here, for the input, a dynamic range that is probably relevant for normal hearing listeners is assumed: A frequency-independent uncomfortable level of $105\,\text{dB~SPL}$ as the upper limit, and levels of $25\,\text{dB~SPL}$ from $500\,\text{Hz}$ to $4\,\text{kHz}$, and $30\,\text{dB}$ above $8\,\text{kHz}$ and below $250\,\text{Hz}$, as the lower limit. % -The assumed input dynamic range is indicated with triangles in the left panel in Figure~\ref{fig:8}. +The assumed input dynamic range is indicated with triangles in the left panel in Figure~\ref{fig:9}. % For the output, an exemplary reduced dynamic range is assumed, which is arbitrarily limited to frequency-independent levels from $50$ to $90\,\text{dB~SPL}$; resembling elevated hearing thresholds due to environmental noise or impaired hearing and a lower acceptance of high levels. % -The targeted output dynamic range is indicated with triangles in the center panel in Figure~\ref{fig:8}. +The targeted output dynamic range is indicated with triangles in the center panel in Figure~\ref{fig:9}. % The mapping of the base layer can be independent from the input and output dynamic range definitions. % @@ -668,7 +687,7 @@ \subsection*{PLATT dynamic range manipulation} % The defined output dynamic range is the reservoir that can be used by PLATT to map the input dynamic. -In our example in Figure~\ref{fig:8}, let's assume the base layer was mapped linearly from the defined input dynamic range to the defined output dynamic range. +In our example in Figure~\ref{fig:9}, let's assume the base layer was mapped linearly from the defined input dynamic range to the defined output dynamic range. % The base layer is depicted as the lightest gray line in the left panel, and the mapped base layer as the lightest gray line in the center panel. 
% @@ -680,7 +699,7 @@ \subsection*{PLATT dynamic range manipulation} % However, unconditionally adding the whole dynamic that is encoded in the differences 1 to 4 could result in output levels below the lower limit of the output dynamic range, which might not contribute to speech recognition anymore, or above the upper limit of the output dynamic range, which might lead to undesirably high output levels. % -A good compromise in the fundamental conflict that too much and too little compression can both result in sub-optimal speech recognition performance requires a compression management which depends on the current spectral input dynamic and the current available output dynamic. +A good compromise in the fundamental conflict that too much and too little compression can both result in sub-optimal speech recognition performance requires a compression management which depends on the current spectral input dynamic and the currently available output dynamic. To prefer spectral patterns which are important for robust (automatic) speech recognition, the differences corresponding to high spectral modulation frequencies are added first. % @@ -688,11 +707,11 @@ \subsection*{PLATT dynamic range manipulation} % The example with two pure tones is an extreme one which assesses the maximum dynamic that can be encoded in each difference, which is about 6\,dB for difference 1. % -The corresponding output dynamic, when only adding difference 1 to the base layer, can be observed in the center panel of Figure~\ref{fig:8} as the light gray curve which only deviates slightly from the base layer. +The corresponding output dynamic, when only adding difference 1 to the base layer, can be observed in the center panel of Figure~\ref{fig:9} as the light gray curve which only deviates slightly from the base layer. 
% Probably the most important spectral modulation frequencies describe spectral patterns between $2$ and $4\,\text{ERB}$ and are mainly encoded in difference 2, which is why it is also added unconditionally. % -To protect this difference, which encodes a maximum dynamic of less than 9\,dB, against a hearing loss of class D such as implemented in FADE with the level uncertainty, it can be expanded by a factor greater than 1 prior to adding it to the base layer. +To protect this difference, which encodes a maximum dynamic of less than 9\,dB, against a hearing loss of class D as implemented in FADE with the level uncertainty, it can be expanded by a factor greater than 1 prior to adding it to the base layer. % The expansion of the difference 2 could increase the total output signal dynamic, which however can often be compensated by an increased compression of the remaining differences 3 and 4, which encode larger part of the signal dynamic, compared to difference 2. @@ -712,13 +731,13 @@ \subsection*{PLATT dynamic range manipulation} % The final desired spectral output levels are described by the sum of the mapped base layer, the unconditionally added differences 1 and 2, and the conditionally compressed differences 3 and 4. % -The frequency-dependent gain needed to achieve the desired spectral output levels is the difference of the spectral output levels (black curve in the center panel in Figure~\ref{fig:8}) and spectral input levels (black curve in the left panel in Figure~\ref{fig:8}). +The frequency-dependent gain needed to achieve the desired spectral output levels is the difference of the spectral output levels (black curve in the center panel in Figure~\ref{fig:9}) and spectral input levels (black curve in the left panel in Figure~\ref{fig:9}). 
% -The final frequency-dependent gain is plotted in black in the right panel of Figure~\ref{fig:8} along with the partial gains that would theoretically be needed after the cumulative addition of only differences 1 to 3 to the base layer in increasingly darker gray shades. +The final frequency-dependent gain is plotted in black in the right panel of Figure~\ref{fig:9} along with the partial gains that would theoretically be needed after the cumulative addition of only differences 1 to 3 to the base layer in increasingly darker gray shades. % -The final frequency-dependent gain is then applied to the output of the Gammatone filter bank with the initially described limitation of the rate of change to avoid audible cross talk between channels. +The final frequency-dependent gain is then applied to the output of the Gammatone filter bank with the limitation of the rate of change to 24\,dB per period of the corresponding center frequencies. -Admittedly, in our example with the two sinusoids, there is only energy at the corresponding frequencies and hence only the level of the two sinusoids will be changed, while the gains at other frequencies will have no effect. +Admittedly, in our example with the two pure tones, there is only energy at 500\,Hz and 2000\,Hz, and hence only the level of the two tones will be changed, while the gains at other frequencies will have no effect. % However, this signal creates a pattern of extreme spectral modulation in the proposed signal analysis, and hence is well suited to illustrate how PLATT works. % @@ -728,22 +747,22 @@ \subsection*{PLATT dynamic range manipulation} % With PLATT, compression is only applied if the available output dynamic range is less than required to represent the input signal dynamic. 
% -The main effect can be observed in Figure~\ref{fig:9}, where the prescribed gains for \emph{high} (solid curves) and \emph{reduced} (dotted curves) spectral input dynamic are shown for an exemplarily reduced output dynamic range, as indicated by the triangles. +The main effect can be observed in Figure~\ref{fig:10}, where the calculated gains for \emph{high} (solid curves) and \emph{reduced} (dotted curves) spectral input dynamic are shown for an exemplarily reduced output dynamic range, as indicated by the triangles. % -\begin{figure*}[h] +\begin{figure*} \centerline{\includegraphics[width=.8\textwidth]{images/details-dynamic}} - \caption{Examples of prescribed spectral gain for an example with reduced available output dynamic range as indicated by the triangles. - Left panel: Spectral input dynamic with two pure tones at different presentation levels (gray shaded solid curves), and with white noise added (dotted curve). - Center panel: Corresponding prescribed spectral output levels. - Right panel: Corresponding gains.} - \label{fig:9} + \caption{Examples of calculated spectral gain for an example with reduced available output dynamic range as indicated by the triangles. + Left panel: Spectral input dynamic with two pure tones at different presentation levels (gray shaded solid curves), and with white noise added (dotted curve). + Center panel: Corresponding spectral output levels. + Right panel: Corresponding gains.} + \label{fig:10} \end{figure*} % -While the signals with high spectral dynamic are compressed (observed here as different gains for different frequencies) to make relevant signal portions audible and avoid excessively high output levels, the signal with low spectral dynamic is not (observed here as similar gains for different frequencies), because it fits in the available output dynamic range. 
+While the signals with high spectral dynamic are compressed (observed here as different gains for different frequencies) to make relevant signal portions audible and avoid excessively high output levels, the signal with low spectral dynamic is not (observed here as similar gains for different frequencies), because it fits in the available output dynamic range without compression. % Only low spectral modulation frequencies, which are less important for speech recognition, are conditionally compressed, while the important higher spectral modulation frequencies can be even expanded. % -An efficient implementation of spectral modulation filtering in the time-domain is possible with reasonably low latency, fast reaction times, and probably little audible artifacts due to signal decomposition and re-synthesis. +An efficient implementation of spectral modulation filtering in the time-domain is possible with reasonably low latency, fast reaction times, and probably little audible artifacts due to the auditory-motivated signal decomposition and re-synthesis. \paragraph{Compensation of level uncertainty with PLATT} % @@ -757,12 +776,12 @@ \subsection*{PLATT dynamic range manipulation} Expansion factors of 2, 4, 6, and 8 were considered for the evaluation, and the corresponding compensation conditions are referred to as PLATT-2 to PLATT-8 in the remainder of the manuscript. % -The effect of processing noisy speech signals at 0\,dB SNR with PLATT-6 is illustrated in Figure~\ref{fig:10}. +The effect of processing noisy speech signals at 0\,dB SNR with PLATT-6 is illustrated in Figure~\ref{fig:11}. 
% -\begin{figure*}[h] +\begin{figure*} \centerline{\includegraphics[width=1\textwidth]{images/signal-processed}} \caption{Illustration of the effect of the dynamic range manipulation with PLATT-6: Log Mel-spectrograms of noisy speech in stationary (left column) and fluctuating noise (right column) at 0\,dB SNR (top row), the same log Mel-spectrograms with level uncertainty of 7\,dB (center row), and log Mel-spectrograms with level uncertainty of 7\,dB of the same signals processed with PLATT-6, that is with an expansion factor of 6.} - \label{fig:10} + \label{fig:11} \end{figure*} % In the top row, log Mel-spectrograms of a speech signal in the stationary and fluctuating noise at 0\,dB SNR are depicted in the left and right panel, respectively. @@ -795,7 +814,7 @@ \subsection*{Evaluation of simulation results} % On the one hand, this depiction allows to assess the effect of the (noise) presentation level on the predicted SRTs. % -On the other hand, it also allows to quantify improvements in SRT in aided conditions over a given reference condition, e.g., normal hearing o unaided condition. +On the other hand, it also allows to quantify improvements in SRT in aided conditions over a given reference condition, e.g., normal hearing or an unaided condition. % Because not all 1760 data points can presented in graphical form in this contribution, a summary of the achieved improvements in SRT at high presentation levels, at which we can confidently assume that linear time-invariant amplification cannot improve the SRT, are presented in the form of a table for all listener profiles and compensations. % @@ -810,13 +829,13 @@ \subsection*{Availability of resources} % FADE\footnote{\url{https://doi.org/10.5281/zenodo.4003779}} version 2.4.0, which is open source sofware, was used for the FADE simulations. 
% -The code and scripts for the setting up the simulations were based on, and are now integrated into the measurement and prediction framework\footnote{??UPDATE-BEFORE-PUBLICATION!!}. - +The code and scripts for the setting up the simulations were based on, and are now integrated into the measurement and prediction framework\footnote{\url{https://doi.org/10.5281/zenodo.4500810}}. +% This includes: \begin{itemize} \item The modified feature extraction. \item The reference implementation of PLATT and the used configuration files - \item The scripts which prepare and run the FADE simulations using the. modified feature extraction and the reference implementation of PLATT. + \item The scripts which prepare and run the FADE simulations using the modified feature extraction and the reference implementation of PLATT. \item The scripts which evaluate raw the experimental results and plot the results figures. \end{itemize} @@ -836,7 +855,7 @@ \subsection*{Effect of limit frequency and level uncertainty} % Two modifications were used to implement a hearing loss of class D, the limitation of the frequency range up to a limit frequency, and the increase of the level uncertainty. % -Their separate effect on simulated SRTs is shown in Figure~\ref{fig:11} and Figure~\ref{fig:12}, respectively. +Their separate effect on simulated SRTs is shown in Figure~\ref{fig:12} and Figure~\ref{fig:13}, respectively. % \begin{figure*}[h!] \centerline{\includegraphics[width=\textwidth]{images/SRTs-frequencyrange}} @@ -844,28 +863,28 @@ \subsection*{Effect of limit frequency and level uncertainty} The dotted lines indicate an SNR of 0\,dB. The corresponding level-dependent differences in SRT compared the profile plotted in black (here profile P-8000-1) are depicted in panels two and four. } - \label{fig:11} + \label{fig:12} \end{figure*} % \begin{figure*}[h!] 
\centerline{\includegraphics[width=\textwidth]{images/SRTs-leveluncertainty}} - \caption{Effect of increasing the level uncertainty (to 7, 14, and 21\,dB). Analog to Figure~\ref{fig:11}.} - \label{fig:12} + \caption{Effect of increasing the level uncertainty (to 7, 14, and 21\,dB). Analog to Figure~\ref{fig:12}.} + \label{fig:13} \end{figure*} % \begin{figure*}[h!] \centerline{\includegraphics[width=\textwidth]{images/SRTs-leveluncertainty-with-frequencyrange}} - \caption{Effect of increasing the level uncertainty when the frequency range is limited to 1000\,Hz. Analog to Figure~\ref{fig:11}.} - \label{fig:13} + \caption{Effect of increasing the level uncertainty when the frequency range is limited to 1000\,Hz. Analog to Figure~\ref{fig:12}.} + \label{fig:14} \end{figure*} % \begin{figure*}[h!] \centerline{\includegraphics[width=\textwidth]{images/SRTs-mixedprofiles}} - \caption{Effect of mixed profiles with increasing level uncertainty and frequency range limitation. Analog to Figure~\ref{fig:11}.} - \label{fig:14} + \caption{Effect of mixed profiles with increasing level uncertainty and frequency range limitation. Analog to Figure~\ref{fig:12}.} + \label{fig:15} \end{figure*} -The limitation of the available frequency range affected the simulated SRTs in the fluctuating noise condition much more than in the stationary noise condition when assuming a level uncertainty of 1\,dB (cf. Figure~\ref{fig:11}). +The limitation of the available frequency range affected the simulated SRTs in the fluctuating noise condition much more than in the stationary noise condition when assuming a level uncertainty of 1\,dB (cf. Figure~\ref{fig:12}). % In the stationary noise condition, the limitation of the frequency range to 4000\,Hz did not have an effect on the simulated outcome of the German matrix sentence test. 
% @@ -881,7 +900,7 @@ \subsection*{Effect of limit frequency and level uncertainty} % At these levels, the increase in SRT was about 3, 6, and 15\,dB for a limitation to 4000, 2000, 1000\,Hz, respectively. -The increase of the level uncertainty also affected the simulated SRTs in the fluctuating noise conditions more than in the stationary noise condition (cf. Figure~\ref{fig:12}). +The increase of the level uncertainty also affected the simulated SRTs in the fluctuating noise conditions more than in the stationary noise condition (cf. Figure~\ref{fig:13}). % The effect was not as level-independent at low presentation levels as one might have expect. % @@ -891,7 +910,7 @@ \subsection*{Effect of limit frequency and level uncertainty} % However, in this contribution, the focus does not lie on the interactions of the level uncertainty with the absolute hearing threshold but effective elimination of the absolute hearing threshold as a factor from the evaluation. % -In the fluctuating noise condition, this is the case for presentation levels of 70\,dB SPL and above, where the differences (cf. panel four in Figure~\ref{fig:12}) stabilize. +In the fluctuating noise condition, this is the case for presentation levels of 70\,dB SPL and above, where the differences (cf. panel four in Figure~\ref{fig:13}) stabilize. % At these levels, the increase in SRT was about 4, 9, and 12\,dB for level uncertainties of 7, 14, and 21\,dB, respectively. % @@ -901,11 +920,11 @@ \subsection*{Effect of limit frequency and level uncertainty} % In listeners with impaired hearing, we assume that both factors, the level uncertainty as well as the limited frequency range, contribute to the class D loss. % -All combinations of both parameters were considered, of which a few are presented in Figure~\ref{fig:13} and \ref{fig:14} to assess their interaction. 
+All combinations of both parameters were considered, of which a few are presented in Figure~\ref{fig:14} and \ref{fig:15} to assess their interaction. % -In Figure~\ref{fig:13}, the simulation results for increased values of level uncertainty are shown when the frequency range was limited to 1000\,Hz. +In Figure~\ref{fig:14}, the simulation results for increased values of level uncertainty are shown when the frequency range was limited to 1000\,Hz. % -Comparing the results to Figure~\ref{fig:12} (limitation to 8000\,Hz), the limitation of the frequency range increased the effect of the level uncertainty in the stationary noise condition, while the effect in the fluctuating noise condition remained similar. +Comparing the results to Figure~\ref{fig:13} (limitation to 8000\,Hz), the limitation of the frequency range increased the effect of the level uncertainty in the stationary noise condition, while the effect in the fluctuating noise condition remained similar. % In the stationary noise condition, the average increase in SRT at levels above 70\,dB was about 3, 7, and 11\,dB for level uncertainties of 7, 14, and 21\,dB, respectively. % @@ -917,15 +936,15 @@ \subsection*{Effect of limit frequency and level uncertainty} % This is due to the strongly non-linear nature of the (automatic) speech recognition process, which can be interpreted to involve a forward error correction scheme that integrates over frequency and over time, and which needs to fail in order to achieve an error rate as high as 50\%. % -Forward error correction depends on redundancy, which is reduces by limited the frequency range. +Forward error correction requires redundancy, which is reduced by limiting the frequency range. % -This idea is further elaborated in the discussion section, but the results clearly show that considering level uncertainty and a limitation of the frequency range separately would not show the full picture. 
+The simulation result shows that considering level uncertainty and a limitation of the frequency range separately would not show the full picture. To graphically present effect of the PLATT compensation on the simulated SRTs, the mixed profiles P-4000-7, P-2000-14, and P-1000-21 were used. % This choice reflects the assumption that the limitation of the frequency range (due to the audiogram) and an increase in level uncertainty are probably correlated to some extent. % -The simulations results with these listener profiles without PLATT compensation are shown in Figure~\ref{fig:14}. +The simulations results with these listener profiles without PLATT compensation are shown in Figure~\ref{fig:15}. % Compared to the normal-hearing configuration, in the stationary noise condition, the average increases in SRT at levels above 70\,dB were about 2, 6, and 13\,dB with the profiles P-4000-7, P-2000-14, and P-1000-21, respectively. % @@ -933,7 +952,7 @@ \subsection*{Effect of limit frequency and level uncertainty} \subsection*{Effect of PLATT expansion} % -The effect of the expansion with PLATT on the simulation results for listener profiles P-4000-7, P-2000-14, and P-1000-21 is presented in Figures~\ref{fig:15}, \ref{fig:16}, and \ref{fig:17}, respectively. +The effect of the expansion with PLATT on the simulation results for listener profiles P-4000-7, P-2000-14, and P-1000-21 is presented in Figures~\ref{fig:16}, \ref{fig:17}, and \ref{fig:18}, respectively. % \begin{figure*}[h!] \centerline{\includegraphics[width=\textwidth]{images/SRTs-platt-P-4000-7}} @@ -941,19 +960,19 @@ \subsection*{Effect of PLATT expansion} The dotted lines indicate an SNR of 0\,dB. The corresponding level-dependent differences in SRT compared the profile plotted in black (here the unaided profile P-4000-7) are depicted in panels two and four. 
The data with profile P-8000-1 (normal hearing) was added as an orientation for normal-hearing performance.} - \label{fig:15} + \label{fig:16} \end{figure*} % \begin{figure*}[h!] \centerline{\includegraphics[width=\textwidth]{images/SRTs-platt-P-2000-14}} - \caption{Simulated \enquote{Plomp curves} for listener profile P-2000-14 without and with PLATT. Analog to Figure~\ref{fig:15}.} - \label{fig:16} + \caption{Simulated \enquote{Plomp curves} for listener profile P-2000-14 without and with PLATT. Analog to Figure~\ref{fig:16}.} + \label{fig:17} \end{figure*} % \begin{figure*}[h!] \centerline{\includegraphics[width=\textwidth]{images/SRTs-platt-P-1000-21}} - \caption{Simulated \enquote{Plomp curves} for listener profile P-1000-21 without and with PLATT. Analog to Figure~\ref{fig:15}.} - \label{fig:17} + \caption{Simulated \enquote{Plomp curves} for listener profile P-1000-21 without and with PLATT. Analog to Figure~\ref{fig:16}.} + \label{fig:18} \end{figure*} % The blue, red, and yellow lines indicate the simulated speech recognition performance with compensations PLATT-2, PLATT-4, and PLATT-6, respectively. @@ -962,7 +981,7 @@ \subsection*{Effect of PLATT expansion} The expansion with PLATT improved the SRTs in almost all simulated conditions. % -The benefit in SRT due to PLATT in was generally higher in the fluctuating noise condition that in the stationary noise condition. +The benefit in SRT due to PLATT was generally higher in the fluctuating noise condition than in the stationary noise condition. % Higher expansion factors strongly tended to result in higher benefits, however, not in all conditions. % @@ -982,9 +1001,9 @@ \subsection*{Effect of PLATT expansion} % Hence, \enquote{stabilized} refers to reaching a state which may allow to assume that the remaining variability is due to the random error. 
% -In the fluctuating noise condition, the random error is increased, because of the generally shallower slope of the psychometric function, which can also be observed in the plotted +In the fluctuating noise condition, the random error is increased, because of the generally shallower slope of the psychometric function in this condition. % -There, the curve of the normal-hearing profile stabilizes at levels of about 70\,dB, due to broad available frequency range. +There, the curve of the normal-hearing profile stabilizes at levels of about 70\,dB. % The curves of the unaided and compensated simulations already stabilize at lower levels. % @@ -995,7 +1014,7 @@ \subsection*{Effect of PLATT expansion} \begin{table}[h] \small\sf\centering \caption{\label{tab:1} - Simulated benefits in SRT compared to the respective unaided conditions averaged over high presentation levels (70, 80, and 90\,dB SPL), where amplification cannot improve the SRT. NH indicates the normal-hearing listener profile. + Simulated benefits in SRT compared to the respective unaided conditions averaged over high presentation levels (70, 80, and 90\,dB SPL), where simple linear amplification cannot improve the SRT. NH indicates the normal-hearing listener profile. } \begin{tabular}{l|rrrr|r} \multicolumn{6}{r}{Stationary noise (OLNOISE)}\\ @@ -1062,8 +1081,6 @@ \subsection*{Effect of PLATT expansion} % An orientative value for the random error of the presented simulation data in Table~\ref{tab:1} is 1\,dB. - - For listener profiles with a level uncertainty of 1\,dB (P-*-1), only small benefits were observed. % In the stationary noise condition, processing the signals with PLATT had no effect on the SRTs ($\leq0.4$\,dB). 
@@ -1090,7 +1107,7 @@ \subsection*{Effect of PLATT expansion} % In other words, in the stationary noise condition, this mid-frequency portion was mostly redundant at the SRT, and the ASR system could discriminate the 50 words of the matrix sentence test almost equally well only using the low-frequency information. % -This was not the case for the fluctuating noise condition, where the mid-frequency portions contributed a substantial part of the information to achieve low SRTs (less than -15\,dB). +This was not the case for the fluctuating noise condition, where the mid-frequency portions contributed a substantial part of the information to achieve low SRTs (say, less than -15\,dB). % For the following presentation of the benefits with PLATT, it is important to keep in mind that this missing information, by design, cannot be compensated with PLATT. % @@ -1102,7 +1119,7 @@ \subsection*{Effect of PLATT expansion} % Hence, the maximum achievable improvement for profile P-2000-14 is then estimated by the difference $14.3-6.5=7.8$\,dB. -For profiles with increased level uncertainty (P-*-7, P-*-14, and P-*-21), the average benefits in SRT due to using PLATT were positive and range from 0.5 to 7.8\,dB. +For profiles with increased level uncertainty (P-*-7, P-*-14, and P-*-21), the average benefits in SRT due to using PLATT were positive and ranged from 0.5 to 7.8\,dB. % For these profiles, the lowest improvements were found with PLATT-2, and the highest improvements often, but not always, with PLATT-8. % @@ -1116,7 +1133,7 @@ \subsection*{Effect of PLATT expansion} % For profiles with a level uncertainty of 14 or 21\,dB, the improvements always increased together with the expansion factor. % -With high values of level uncertainty, higher expansion factors might further improve the SRT; however, increasing the dynamic of a portion eight fold might have undesirable collateral effects which are discussed later. 
+With high values of level uncertainty, higher expansion factors might further improve the SRT; however, increasing the dynamic of a signal portion eight fold might have undesirable collateral effects which are discussed later. In absolute terms, the improvements were generally larger in the fluctuating noise condition than in the stationary noise condition. % @@ -1126,11 +1143,11 @@ \subsection*{Effect of PLATT expansion} % While this interpretation correctly reflects the proportion of the total class D loss that was compensated, it does not reflect that a part of the class D loss cannot be compensated by expansion approach with PLATT by design. % -As explained before, the portion of the class D loss due to limiting the frequency range to 1000\,Hz is vastly different in both maskers. +As explained earlier, the portion of the class D loss due to limiting the frequency range to 1000\,Hz is very different in both maskers. % -To evaluate the achieved improvements with respect to the maximum achievable improvement, the class D loss due to limiting the frequency range needed to be disregarded. +To evaluate the achieved improvements with respect to the maximum achievable improvement, the class D loss due to limiting the frequency range needs to be disregarded. % -In the context of the maximum achievable improvement, PLATT-8 compensated 5.3\,dB of $(13.1-2.3=)10.8$\,dB and 7.8\,dB of $(26.3-15.0)=11.3$\,dB of the class D loss due to a level uncertainty of 21\,dB in the stationary and fluctuating noise condition, respectively. +In the context of the maximum achievable improvement, PLATT-8 compensated 5.3\,dB of $(13.1-2.3=)10.8$\,dB and 7.8\,dB of $(26.3-15.0=)11.3$\,dB of the class D loss due to a level uncertainty of 21\,dB in the stationary and fluctuating noise condition, respectively. 
% For the intermediate mixed profile P-2000-14, PLATT-6 compensated 3.0\,dB of $(6.5-1.2=)5.3$\,dB and 5.3\,dB of $(14.3-6.5=)7.8$\,dB in the stationary and fluctuating noise condition, respectively. % @@ -1140,9 +1157,9 @@ \subsection*{Effect of PLATT expansion} The absolute improvements due to PLATT tended to increase with an increased limitation of the frequency range. % -The effect of the level uncertainty was clearly frequency-dependent, and increased together with the limitation of the frequency range. +The effect of the level uncertainty increased with an increasing limitation of the frequency range. % -Based on the presented data, however, it is difficult to make statements about the frequency-dependency of an optimal factor for the expansion because of the divers non-linear interactions between the considered parameters. +Based on the presented data, however, it is difficult to make statements about the frequency-dependency of an optimal expansion factor because of the diverse non-linear interactions between the considered parameters. \subsection*{SNR-dependency of benefits in SRT} % @@ -1152,46 +1169,46 @@ \subsection*{SNR-dependency of benefits in SRT} % The main reason for simulating the SRT-50 was that it can be accurately measured in later listening experiments with human listeners, and accurate measurements are a requirement to show effects as small as 1\,dB. % -To assess if the improvements would, at least according to the model, translate to improvements at SRTs preferred in real conversations the psychometric functions of aided and unaided conditions were compared. +To assess if the improvements would (at least according to the model) translate to improvements at SRTs preferred in real conversations, the psychometric functions of aided and unaided conditions were compared. % Segments of psychometric functions were obtained by evaluating simulations at SRT-20, SRT-25, ..., SRT-90. 
% -Figure~\ref{fig:18}, \ref{fig:19}, and \ref{fig:20} present the unaided and aided psychometric functions for the mixed profiles P-4000-7, P-2000-14, and P-1000-21, respectively. +Figure~\ref{fig:19}, \ref{fig:20}, and \ref{fig:21} present the unaided and aided psychometric functions for the mixed profiles P-4000-7, P-2000-14, and P-1000-21, respectively. % \begin{figure} \centerline{\includegraphics[width=\columnwidth]{images/psyfun-P-4000-7}} \caption{Segments of psychometric functions for simulations with listener profile P-4000-7 without and with PLATT expansion factors 2, 4 and 6 in stationary noise (left panel) and fluctuating noise (right panel). The dotted and dashed lines indicate a word recognition rate of 50\% and 80\% correct, respectively.} - \label{fig:18} + \label{fig:19} \end{figure} % \begin{figure} \centerline{\includegraphics[width=\columnwidth]{images/psyfun-P-2000-14}} - \caption{Segments of psychometric functions for simulations with listener profile P-2000-14 without and with PLATT. Analog to Figure~\ref{fig:18}.} - \label{fig:19} + \caption{Segments of psychometric functions for simulations with listener profile P-2000-14 without and with PLATT. Analog to Figure~\ref{fig:19}.} + \label{fig:20} \end{figure} % \begin{figure} \centerline{\includegraphics[width=\columnwidth]{images/psyfun-P-1000-21}} - \caption{Segments of psychometric functions for simulations with listener profile P-1000-21 without and with PLATT. Analog to Figure~\ref{fig:18}.} - \label{fig:20} + \caption{Segments of psychometric functions for simulations with listener profile P-1000-21 without and with PLATT. Analog to Figure~\ref{fig:19}.} + \label{fig:21} \end{figure} % The simulation results in the unaided conditions are plotted in black, the corresponding simulation results with PLATT2, PLATT4, PLATT6, and PLATT8 expansion in blue, red, yellow, and purple color, respectively. 
% As expected, the slopes in the fluctuating noise conditions (right panels) were shallower than the slopes in the stationary noise condition (left panel). % -Also, as expected, the data points in the fluctuating noise conditions were more noisy, due to the much greater variability in the spectro-temporal distribution of the masker energy. +Also, as expected, the data points in the fluctuating noise conditions were more noisy, due to the greater variability in the spectro-temporal distribution of the masker energy. % This variability could be decreased by increasing the amount of training data and testing data. % Within the uncertainty due to this variability, no reduction in improvement between SRT-50 and SRT-80 can be observed for listener profile P-4000-7. % -For profile P-2000-14 (in Figure~\ref{fig:19}), where the improvements are larger, there was a trend towards a slight increase in slope with higher expansion factors. +For profile P-2000-14 (in Figure~\ref{fig:20}), where the improvements are larger, there was a trend towards a slight increase in slope with higher expansion factors. % This indicates, that the improvements due to the expansion with PLATT of the SRT-80 might be slightly larger than the corresponding improvement of the SRT-50. % -This trend was clearly confirmed by the data with the profile P-1000-21 in Figure~\ref{fig:20}. +This trend was confirmed by the data with the profile P-1000-21 in Figure~\ref{fig:21}. % According to the these simulations, the expansion with PLATT was found to improve the simulated SRT-80 to the same extent or even more than the SRT-50. @@ -1199,37 +1216,39 @@ \subsection*{SNR-dependency of benefits in SRT} \section*{Discussion} \label{sec:discussion} % -The presented experimental results were derived using a \emph{model} of auditory perception, more specifically, a model of human speech recognition based on automatic speech recognition. 
+The presented experimental results were derived using a model of auditory perception, more specifically, a model of impaired human speech recognition based on automatic speech recognition. % -The main aim of creating models is to bring assumptions into a form in which they can be tested, e.g. by comparing quantitative predictions to empirical measurements, and the respective discipline could be referred to as \emph{theoretical audiology}. +The modeling approach with FADE brings assumptions about the impaired human speech recognition process into a form in which they can be tested by comparing predictions with empirical data. % The employed model, the framework for auditory discrimination experiments (FADE) such as it was used by \cite{schaedler2020a}, was already evaluated with respect to predictions of the individual aided speech recognition performance of listeners with impaired hearing. % -There, as elaborated in the \nameref{sec:intoduction}, an important assumption was that the part of the hearing loss which cannot be explained by the absolute hearing threshold, that is, the missing piece to describe the effect of hearing loss on speech recognition in noise, can be explained by the level uncertainty and that this model parameter could be inferred from tone in noise detection tests. +There, as elaborated in the \nameref{sec:intoduction}, an important assumption was that the part of the hearing loss which cannot be explained by the absolute hearing threshold, that is, the missing piece to describe the effect of hearing loss on speech recognition in noise, can be explained by the level uncertainty. +% +Another assumption was, that this model parameter also affects tone in noise perception and hence its value could be inferred from tone in noise detection tests. % While the results supported this hypothesis, evidence was not sufficient to rule out other mechanisms that would also increase the SRTs in noise. 
% This is a fundamental problem in modeling the individual speech recognition performance. % -While the observable of the model, the SRT, can be empirically determined, it still depends on many correlated and non-linearly interacting parameters, including some which cannot be controlled very well (e.g. attention). +While the quantity that is predicted by the model, the SRT, can be measured in experiments with human listeners, the outcome of such measurements still depends on many correlated and non-linearly interacting parameters, some of which cannot be controlled very well, such as attention. % -And even if the experimental results are measured with the most accurate methods, the measurement errors include this uncontrollable (human) variability which increases the required amount of data to falsify hypotheses to infeasible regions. +And even if the experimental results were measured with the most accurate methods, the measurement errors include this uncontrollable (human) variability which can increase the required amount of data to falsify hypotheses to infeasible regions. % This is especially true for hypotheses which predict relatively small effects. % -Hence, there is reasonable doubt about whether the removal of information in listeners with impaired hearing is really well described by the level uncertainty, or if it just coincidentally increased the SRT in the right conditions. +Hence, there is reasonable doubt about whether the removal of information in listeners with impaired hearing is really well described by the level uncertainty, or if it just coincidentally increased the SRT in the correct conditions. -To more specifically test if the level uncertainty describes the effect of hearing loss on speech recognition in noise, a suitable approach is trying to interact with it. 
+To more specifically test if the level uncertainty is suitable to describe the effect of hearing loss on speech recognition in noise, a promising approach is to interact with it. % The expansion of PLATT was specifically designed to interact with---namely compensate---the effect of the level uncertainty. % -The presented results clearly show that this interaction can be observed in model SRTs. +The modeled data clearly showed this interaction. % If this specific interaction was found in empirical data, it would be strongly supportive for the hypothesis of the existence of a mechanism in the human auditory system similar to the level uncertainty. % -Beyond the academical interest in the suitability of the assumptions to describe human speech recognition performance, a positive result would have immediate practical implications for the design of hearing loss compensation strategies. +Beyond the academic interest in the suitability of the assumptions to describe impaired human speech recognition performance, a positive result would have immediate practical implications for the design of hearing loss compensation strategies. -Let's recall that the goal was not to test if the expansion with PLATT improves speech recognition performance of listeners with impaired hearing. +Let's remember that the goal was not to test if the expansion with PLATT improves speech recognition performance of listeners with impaired hearing. % For that, measurements with listeners with impaired hearing will be necessary. % @@ -1237,7 +1256,7 @@ \section*{Discussion} % \begin{itemize} \item[A)] Present an approach which is able to partially compensate a class D loss as implemented with the level uncertainty in FADE, and - \item[B)] Objectively evaluate this approach and come up with testable quantitative hypothesis on the benefit in noisy listening conditions. 
+ \item[B)] objectively evaluate this approach and come up with testable quantitative hypothesis on the benefit in noisy listening conditions. \end{itemize} % The latter aim, in other words, was to guide the planning of the measurements with listeners with impaired hearing towards optimal evidence. @@ -1268,9 +1287,9 @@ \subsection*{What is a realistic class D loss?} % An important observation he made was that data points from listeners with age-related hearing loss agreed well with data points based from studies on other sensorineural hearing impairments. % -As a consequence he considered age-related hearing loss to be primarily due to deterioration in the auditory pathway rather than to \enquote{mental impairment}. +As a consequence he considered age-related hearing loss to be primarily due to deterioration in the auditory pathway rather than to mental impairment. % -This last point is fundamental considering that mental impairment can probably not be compensated by signal processing. +This last point is fundamental considering that mental impairment can most likely not be compensated by signal processing. However, these findings have to be taken with care. % @@ -1278,15 +1297,15 @@ \subsection*{What is a realistic class D loss?} % The resulting list of possible systematic and random errors in the underlying data is long. % -The main contribution to the systematic error, that is an expectable difference across studies, apart from the calibration error and the different listener panels was probably the use of different speech tests (including measurement paradigm, speech material, masker, presentation, ...). +The main contribution to the systematic error (expectable differences across studies) apart from the calibration error and the different listener panels was probably the use of different speech tests (including measurement paradigm, speech material, masker, presentation, ...). % -It is known, that the speech material, e.g. 
logatomes, numbers, isolated words, or sentences, makes a difference in the outcome of a speech in noise recognition experiment. +It is known that the type of speech material (e.g., logatomes, numbers, isolated words, or sentences) makes a difference in the outcome of a speech in noise recognition experiment. % Hence, tests with different speech material might also be differently susceptible to hearing loss and result in different values of SHL$_\text{D}$. % There is no reason to assume that SHL$_\text{D}$ is independent from the speech test. % -The main contribution to the random error, that is, the unpredictable difference across measurements, was probably different for the studies and due to the stochastic nature of the measurement procedures. +The main contribution to the random error (unpredictable differences across measurements) was probably different for the studies and due to the stochastic nature of the measurement procedures. % Both, the random and the systematic errors were not specifically considered in the analysis of \cite{plomp1978}. % @@ -1314,13 +1333,13 @@ \subsection*{What is a realistic class D loss?} % An interpretation of this relation could be that a given hearing loss in quiet is very likely related to a certain hearing loss in noise. % -But even this interpretation would be only valid if additional amplification would not change the relation, that is, if the same relation was observed for a noise presentation level of, e.g., 75\,dB SPL instead of 65\,dB SPL. +This interpretation would only be valid if additional amplification would not change the relation, that is, if the same relation was observed for a noise presentation level of, e.g., 75\,dB SPL instead of 65\,dB SPL. % However, it is difficult to speculate on that. % -On the one hand, the test-specific noise (OLNOISE) minimizes the effect of the individual hearing threshold. 
+On the one hand, the test-specific noise (OLNOISE) reduces the effect of the individual hearing threshold by masking the speech signals with a stationary matched-spectrum noise. % -On the other hand, for higher PTAs, the individual hearing threshold will eventually exceed the noise level at high frequencies, which can be compensated by amplification. +On the other hand, for higher PTAs, the individual hearing threshold will eventually exceed the noise level at high frequencies, which could be compensated by amplification. % Then again, according to the simulations presented in this contribution, the removal of high-frequency portions ($>4000$\,Hz) of the signals in the OLNOISE condition had little effect on the SRT. % @@ -1370,7 +1389,7 @@ \subsection*{Role of PLATT expansion} % The expansion feature of the PLATT dynamic range manipulation approach aims to mitigate the increase in SRT due to an increased level uncertainty. % -In a model-driven process, it was specifically designed to compensate the effect of the level uncertainty on speech recognition performance such as it is implemented in FADE. +It was specifically designed to compensate the effect of the level uncertainty on speech recognition performance such as it is implemented in FADE. % The presented simulation results showed that this necessary interim goal was achieved and indicate that about half of class D loss due to an increased level uncertainty was compensated. % @@ -1428,7 +1447,7 @@ \subsection*{Potential of compensating a class D loss} % This is because the individual benefit in SRT is derived from two individual measurements (aided and unaided), each generating a random error of about 0.7\,dB (standard deviation) for listeners with impaired hearing. % -If the real benefit is lower than about ($0.7\cdot\sqrt{2}\cdot1.96\approx1.94\approx$)2\,dB, a significant (with p-value $\approx0.05$) effect can be only shown in group averages but not in single individual measurements. 
+If the real benefit is lower than about ($0.7\cdot\sqrt{2}\cdot1.96\approx1.94\approx$)2\,dB, a significant (with p-value $<0.05$) effect can be only shown in group averages but not in single individual measurements. % However, showing an effect in individual measurements is highly preferable because then the data could be used to further analyze the individual characteristics of listeners with and without benefits. % @@ -1458,12 +1477,12 @@ \subsection*{Audio quality and loudness} % But the non-linear manipulation inevitable results in audible artifacts which probably affect audio quality perception. % -The effect of processing noisy speech signals at 0\,dB SNR with PLATT-1, PLATT-4, and PLATT-8 is illustrated in Figure~\ref{fig:21}. +The effect of processing noisy speech signals at 0\,dB SNR with PLATT-1, PLATT-4, and PLATT-8 is illustrated in Figure~\ref{fig:22}. % \begin{figure*}[h!] \centerline{\includegraphics[width=1.0\textwidth]{images/signal-platt-processed}} \caption{Illustration of the effect of the expansion factor with PLATT: Log Mel-spectrograms of processed noisy speech in stationary (left column) and fluctuating noise (right column) at 0\,dB SNR processed with PLATT-1 (top row), PLATT-4 (center row), and PLATT-8 (bottom row).} - \label{fig:21} + \label{fig:22} \end{figure*} % The spectro-temporal maxima of the shown log Mel-spectrograms are 68.8, 73.4, and 79.8\,dB SPL for the noisy signals in stationary noise, and 77.0, 81.8, and 89.3\,dB for the noisy signals in fluctuating noise. @@ -1506,11 +1525,11 @@ \subsection*{Proposed measurement conditions} % Based on the presented simulations and considerations, the following experimental measurement conditions are proposed to test the presented hypothesis that a part of a class D hearing loss can be compensated by expanding spectral patterns in the range between 2 and 4\,ERB in speech in noise recognition experiments with human listeners. 
% -Regarding the noise maskers, an evaluation with the test-specific noise (e.g. OLNOISE for the German matrix sentence test), the fluctuating ICRA5-250 noise, and, in addition, a competing voice masker, e.g., the International Speech Test Signal \citep[ISTS;][]{holube2010}, is proposed. +Regarding the noise maskers, an evaluation with the test-specific noise (e.g. OLNOISE for the German matrix sentence test), the fluctuating ICRA5-250 noise, and, in addition, a competing voice masker, e.g., the International Speech Test Signal \citep[ISTS;][]{holube2010} or its optimized version with limited pause durations, the International Female Fluctuating Masker (IFFM\footnote{\url{https://www.ehima.com/documents/}}), is proposed. % -The ISTS masker is interesting because it shares the speech modulation properties with the target speaker and usually results in very weak masking for listeners with normal hearing. +The IFFM masker is interesting because it shares the speech modulation properties with the target speaker and usually results in very weak masking for listeners with normal hearing. % -It was not included in the objective evaluation with FADE because predictions with FADE for competing talker scenarios are known to be inaccurate; \cite{schaedler2018} reported the effect of the masker to be overestimated by approx 10\,dB. +It was not included in the objective evaluation with FADE because predictions with FADE for competing talker scenarios are known to be inaccurate; \cite{schaedler2018} reported the effect of the masker to be overestimated by approximately 10\,dB. The noise presentation levels ideally would be sufficiently high to compensate any hearing loss which is compensable by simple amplification. % @@ -1557,7 +1576,7 @@ \subsection*{Proposed measurement conditions} This additional data could be used to perform individual predictions of benefits with FADE and compare them to the individual measurements. 
% The comparison of individual measurements and predictions is crucial to -understand and correct invalid assumptions in the model. +detect invalid assumptions in the model. \subsection*{Final practical considerations} % @@ -1571,191 +1590,179 @@ \subsection*{Final practical considerations} % Hence, as a compromise, measurements with lists of 20 sentences could be used and repeated on a different day, which would also allow to detect a possibly remaining training effect. -For the first approach, headphone measurements are preferable over free-field measurements because they can be performed monaurally, which is recommendable. +For a first approach, headphone measurements are preferable over free-field measurements because they can be performed monaurally, which is recommendable. % -A monaural presentation prevents the interference of possibly individual binaural effects and facilitate the comparison to model predictions, as discussed by \cite{schaedler2020a}. +A monaural presentation prevents the interference of possibly individual binaural effects and facilitates the comparison to model predictions, as discussed by \cite{schaedler2020a}. + +\subsection*{Outlook} +% +Once there is evidence whether the proposed compensation strategy has a positive effect on speech recognition performance of listeners with impaired hearing, the band-width limitation to 2000\,Hz that was recommended for the first experiments should be removed. +% +For the next steps, it would not be important anymore to demonstrate the compensation of a class D loss alone, but to show benefits in more realistic and individually optimized configurations. +% +Hence, there should be a shift towards a joint individual optimization of the compensation of class A and class D loss, loudness perception, and audio quality. +% +Then, care should be taken to individually normalize loudness perception to get meaningful data. 
% -If the expansion approach with PLATT proves to work in monaural listening conditions, the concept should be extended to a binaural listening condition, for which a free-field setup with a mobile hearing aid prototype hardware is recommendable to correctly assess individual binaural hearing, such as, e.g., binaural loudness perception. +If the expansion approach with PLATT proves to work in monaural listening conditions, the concept should be extended to a binaural listening condition. +% +For this, a free-field setup with a mobile hearing aid prototype hardware is recommendable to correctly assess individual binaural hearing, including binaural loudness perception. +% +To better understand which speech portions are most affected by PLATT expansion, it would also be interesting to study its effect on phonemic contrasts. %% CONCLUSIONS \section*{Conclusions} \label{sec:conclusion} The most important findings of this work can be summarized as follows: \begin{itemize} - \item The functional modeling of the class D hearing loss with the framework for auditory discrimination experiments (FADE), implemented by means of the level uncertainty, was interpreted as the counterpart of a compensation strategy which aims to (partially) compensate a class D loss. + \item The functional modeling of the class D hearing loss with the framework for auditory discrimination experiments (FADE), implemented by means of the level uncertainty, was interpreted as the counterpart of a compensation strategy which aims to (partially) compensate a class D hearing loss. % \item The strict low-delay constraints in hearing aid applications only allow for a manipulation of mainly spectral modulation patterns. Of these, the patterns in the range of 2 to 4\,ERB seem especially suitable to protect them against the effect of the level uncertainty by dynamic range expansion.
% - \item A low-delay, real-time capable implementation of a patented dynamic range manipulation scheme (PLATT) which allows to perform the required dynamic range expansion was proposed. The implementation was optimized to run in real-time on the Raspberry Pi 3B platform. + \item A low-delay, real-time capable implementation of a patented dynamic range manipulation scheme (PLATT) which allows to perform the required dynamic range expansion was proposed. The implementation was optimized to run in real-time on the Raspberry Pi 3 Model B platform. % \item The evaluation of the PLATT implementation with FADE for several idealized profiles of hearing loss indicated that, according to the model, approximately half of the class D hearing loss due to an increased level uncertainty was compensable. % - \item FADE was used for the first time in an attempt to formulate quantitative hypothesis on the outcome of specific speech recognition experiment \emph{prior} to performing these. The hypothesis, that a hearing loss of class D can be (partially) compensated in the considered noisy listening conditions can be directly tested with human listeners in the same listening conditions. Recommendations for this experiment were elaborated. + \item FADE was used for the first time in an attempt to formulate a quantitative hypothesis on the outcome of specific speech recognition experiments \emph{prior} to performing these. The hypothesis that a hearing loss of class D can be (partially) compensated in the considered noisy listening conditions can be directly tested with human listeners in the same listening conditions. Recommendations for this experiment were elaborated. \end{itemize} \bibliographystyle{apalike} \begin{thebibliography}{} \bibitem[Bisgaard et~al., 2010]{bisgaard2010} - Bisgaard, N., Vlaming, M.~S., and Dahlquist, M. (2010). + Bisgaard, N., Vlaming, M.~S., and Dahlquist, M. (2010) \newblock Standard audiograms for the IEC 60118-15 measurement procedure.
- \newblock {\em Trends in amplification}, 14(2):113--120. - - \bibitem[Bramhall et~al., 2019]{bramhall2019} - Bramhall, N., Beach, E.~F., Epp, B., Le Prell, C.~G., Lopez-Poveda, E.~A., Plack, C.~J., Schaette, R., Verhulst, S., and Canlon, B. (2019). - \newblock The search for noise-induced cochlear synaptopathy in humans: Mission impossible? - \newblock {\em Hearing Research}, 377:88--103. - - \bibitem[Castro Martinez and Schädler, 2016]{castromartinez2016} - Castro Martinez, A.~M. and Schädler, M.~R. (2016). - \newblock Why do ASR Systems Despite Neural Nets Still Depend on Robust Features. - \newblock In {\em Proceedings of INTERSPEECH}, pages 1883--1887. - + \newblock {\em Trends in amplification}, 14(2):113--120, \url{https://doi.org/10.1177%2F1084713810379609} + \bibitem[Dreschler, 1992]{dreschler1992} - Dreschler, W.~A. (1992). + Dreschler, W.~A. (1992) \newblock Fitting multichannel-compression hearing aids. - \newblock {\em Audiology}, 31(3):121--131. - + \newblock {\em Audiology}, 31(3):121--131, \url{https://doi.org/10.3109/00206099209072907} + \bibitem[Dreschler et~al., 2001]{dreschler2001} - Dreschler, W.~A., Verschuure, H., Ludvigsen, C., and Westermann, S. (2001). - \newblock ICRA noises: artificial noise signals with speech-like spectral and temporal properties for hearing instrument assessment - \newblock {\em Audiology}, 40(3):148--157. - + Dreschler, W.~A., Verschuure, H., Ludvigsen, C., and Westermann, S. (2001) + \newblock ICRA noises: artificial noise signals with speech-like spectral and temporal properties for hearing instrument assessment. + \newblock {\em Audiology}, 40(3):148--157, \url{https://doi.org/10.3109/00206090109073110} + \bibitem[ETSI, 2007]{etsi2007} - European Telecommunications Standards Institute (2007). 
+ European Telecommunications Standards Institute (2007) \newblock "202 050 v1.1.5" Speech processing transmission and quality aspects (STQ); Distributed speech recognition; Advanced front-end feature extraction algorithm; Compression algorithms. - \newblock {\em Standard}. - + \newblock {\em Standard}, \url{https://www.etsi.org/deliver/etsi_es/202000_202099/202050/01.01.05_60/es_202050v010105p.pdf} + \bibitem[Grimm et~al., 2015]{grimm2015} - Grimm, G., Herzke, T., Ewert, S., and Hohmann, V. (2015). - \newblock Implementation and evaluation of an experimental hearing aid dynamic range compressor - \newblock In {\em Proceedings of German Annual Conference on Acoustics}, 185--188. - + Grimm, G., Herzke, T., Ewert, S., and Hohmann, V. (2015) + \newblock Implementation and evaluation of an experimental hearing aid dynamic range compressor. + \newblock In {\em Proceedings of German Annual Conference on Acoustics}, 185--188, \url{http://pub.dega-akustik.de/DAGA_2015/data/articles/000429.pdf} + \bibitem[Hochmuth et~al., 2015]{hochmuth2015} - Hochmuth, S., Kollmeier, B., Brand, T., and Jürgens, T. (2015). - \newblock Influence of noise type on speech reception thresholds across four languages measured with matrix sentence tests - \newblock {\em International Journal of Audiology}, 54(sup2):62--70. + Hochmuth, S., Kollmeier, B., Brand, T., and Jürgens, T. (2015) + \newblock Influence of noise type on speech reception thresholds across four languages measured with matrix sentence tests. + \newblock {\em International Journal of Audiology}, 54(sup2):62--70, \url{https://doi.org/10.3109/14992027.2015.1046502} \bibitem[Hohmann and Kollmeier, 1995]{hohmann1995} - Hohmann, V. and Kollmeier, B. (1995). + Hohmann, V. and Kollmeier, B. (1995) \newblock The effect of multichannel dynamic compression on speech intelligibility. - \newblock {\em The Journal of the Acoustical Society of America}, 97(2):1191--1195. 
+ \newblock {\em The Journal of the Acoustical Society of America}, 97(2):1191--1195, \url{https://doi.org/10.1121/1.413092} \bibitem[Holube et~al., 2010]{holube2010} - Holube, I., Fredelake, S., Vlaming, M. and Kollmeier, B. (2010). - \newblock Development and analysis of an international speech test signal (ISTS) - \newblock {\em International Journal of Audiology}, 49(12):891--903, \url{https://doi.org/10.3109/14992027.2010.506889}. + Holube, I., Fredelake, S., Vlaming, M. and Kollmeier, B. (2010) + \newblock Development and analysis of an international speech test signal (ISTS). + \newblock {\em International Journal of Audiology}, 49(12):891--903, \url{https://doi.org/10.3109/14992027.2010.506889} \bibitem[Hülsmeier et~al., 2020]{huelsmeier2020} - Hülsmeier, D., Warzybok, A., Kollmeier, B., and Schädler, M.~R. (2020). - \newblock Simulations with FADE of the effect of impaired hearing on speech recognition performance cast doubt on the role of spectral resolution - \newblock {\em Hearing Research}, 395. + Hülsmeier, D., Warzybok, A., Kollmeier, B., and Schädler, M.~R. (2020) + \newblock Simulations with FADE of the effect of impaired hearing on speech recognition performance cast doubt on the role of spectral resolution. + \newblock {\em Hearing Research}, 395, \url{https://doi.org/10.1016/j.heares.2020.107995} \bibitem[ISO 226, 2003]{iso2003} ISO (2003). \newblock Standard 226: 2003: Acoustics--normal equal-loudness-level contours. - \newblock {\em International Organization for Standardization}, 63. + \newblock {\em International Organization for Standardization}, 63, \url{https://www.iso.org/standard/34222.html} \bibitem[Kollmeier et~al., 2015]{kollmeier2015} - Kollmeier, B., Warzybok, A., Hochmuth, S., Zokoll, M.~A., Uslar, V., Brand, T., and Wagener, K.~C. (2015). - \newblock The multilingual matrix test: Principles, applications, and - comparison across languages: A review. - \newblock {\em International Journal of Audiology}, 54(sup2):3--16. 
+ Kollmeier, B., Warzybok, A., Hochmuth, S., Zokoll, M.~A., Uslar, V., Brand, T., and Wagener, K.~C. (2015) + \newblock The multilingual matrix test: Principles, applications, and comparison across languages: A review. + \newblock {\em International Journal of Audiology}, 54(sup2):3--16, \url{https://doi.org/10.3109/14992027.2015.1020971}. \bibitem[Kollmeier et~al., 2016]{kollmeier2016} - Kollmeier, B., Schädler, M.~R., Warzybok, A., Meyer, B.~T., and Brand, T. - (2016). - \newblock Sentence recognition prediction for hearing-impaired listeners in - stationary and fluctuation noise with fade: Empowering the attenuation and - distortion concept by Plomp with a quantitative processing model. - \newblock {\em Trends in Hearing}, 20. + Kollmeier, B., Schädler, M.~R., Warzybok, A., Meyer, B.~T., and Brand, T. (2016) + \newblock Sentence recognition prediction for hearing-impaired listeners in stationary and fluctuation noise with fade: Empowering the attenuation and distortion concept by Plomp with a quantitative processing model. + \newblock {\em Trends in Hearing}, 20, \url{https://doi.org/10.1177%2F2331216516655795} \bibitem[Moore et~al., 1999]{moore1999} - Moore, B.~C.~J., Peters, R.~W., and Stone, M.~A. (1999). + Moore, B.~C.~J., Peters, R.~W., and Stone, M.~A. (1999) \newblock Benefits of linear amplification and multichannel compression for speech comprehension in backgrounds with spectral and temporal dips. - \newblock {\em The Journal of the Acoustical Society of America}, 105(1):400--411. + \newblock {\em The Journal of the Acoustical Society of America}, 105(1):400--411, \url{https://doi.org/10.1121/1.424571} \bibitem[Plomp, 1978]{plomp1978} - Plomp, R. (1978). + Plomp, R. (1978) \newblock Auditory handicap of hearing impairment and the limited benefit of hearing aids. - \newblock {\em The Journal of the Acoustical Society of America}, 63(2):533--549. 
+ \newblock {\em The Journal of the Acoustical Society of America}, 63(2):533--549, \url{https://doi.org/10.1121/1.381753} \bibitem[Plomp, 1988]{plomp1988} - Plomp, R. (1988). + Plomp, R. (1988) \newblock The negative effect of amplitude compression in multichannel hearing aids in the light of the modulation-transfer function. - \newblock {\em The Journal of the Acoustical Society of America}, 83(6):2322--2327. + \newblock {\em The Journal of the Acoustical Society of America}, 83(6):2322--2327, \url{https://doi.org/10.1121/1.396363} \bibitem[Schädler et~al., 2012]{schaedler2012} - Schädler, M.~R., Meyer, B., and Kollmeier, B. (2012). + Schädler, M.~R., Meyer, B., and Kollmeier, B. (2012) \newblock Spectro-temporal modulation subspace-spanning filter bank features for robust automatic speech recognition. - \newblock {\em The Journal of the Acoustical Society of America}, - 131(5):4134--4151. + \newblock {\em The Journal of the Acoustical Society of America}, 131(5):4134--4151, \url{https://doi.org/10.1121/1.3699200} \bibitem[Schädler et~al., 2015]{schaedler2015} - Schädler, M.~R., Warzybok, A., Hochmuth, S., and Kollmeier, B. (2015). - \newblock Matrix sentence intelligibility prediction using an automatic speech - recognition system. - \newblock {\em International Journal of Audiology}, 54(sup2):100--107. + Schädler, M.~R., Warzybok, A., Hochmuth, S., and Kollmeier, B. (2015) + \newblock Matrix sentence intelligibility prediction using an automatic speech recognition system. + \newblock {\em International Journal of Audiology}, 54(sup2):100--107, \url{https://doi.org/10.3109/14992027.2015.1061708} - \bibitem[Schädler et~al., 2016b]{schaedler2016a} - Schädler, M.~R., Warzybok, A., Ewert, S.~D., and Kollmeier, B. (2016b). - \newblock A simulation framework for auditory discrimination experiments: - Revealing the importance of across-frequency processing in speech perception. - \newblock {\em The Journal of the Acoustical Society of America}, - 139(5):2708--2722. 
+ \bibitem[Schädler et~al., 2016a]{schaedler2016a} + Schädler, M.~R., Warzybok, A., Ewert, S.~D., and Kollmeier, B. (2016a) + \newblock A simulation framework for auditory discrimination experiments: Revealing the importance of across-frequency processing in speech perception. + \newblock {\em The Journal of the Acoustical Society of America}, 139(5):2708--2722, \url{https://doi.org/10.1121/1.4948772} - \bibitem[Schädler et~al., 2016a]{schaedler2016b} - Schädler, M.~R., Hülsmeier, D., Warzybok, A., Hochmuth, S., and Kollmeier, B. - (2016a). - \newblock Microscopic multilingual matrix test predictions using an ASR-based - speech recognition model. - \newblock In {\em Proceedings of INTERSPEECH}, 610--614. + \bibitem[Schädler et~al., 2016b]{schaedler2016b} + Schädler, M.~R., Hülsmeier, D., Warzybok, A., Hochmuth, S., and Kollmeier, B. (2016b) + \newblock Microscopic multilingual matrix test predictions using an ASR-based speech recognition model. + \newblock In {\em Proceedings of INTERSPEECH}, 610--614, \url{http://dx.doi.org/10.21437/Interspeech.2016-1119} \bibitem[Schädler et~al., 2018]{schaedler2018} - Schädler, M.~R., Warzybok, A., and Kollmeier, B. (2018). - \newblock {Objective Prediction of Hearing Aid Benefit Across Listener Groups Using Machine Learning: Speech Recognition Performance With Binaural Noise-Reduction Algorithms.} + Schädler, M.~R., Warzybok, A., and Kollmeier, B. (2018) + \newblock Objective Prediction of Hearing Aid Benefit Across Listener Groups Using Machine Learning: Speech Recognition Performance With Binaural Noise-Reduction Algorithms. \newblock {\em Trends in Hearing}, 22, \url{https://doi.org/10.1177/2331216518768954}. \bibitem[Schädler et~al., 2020a]{schaedler2020a} - Schädler, M.~R., Hülsmeier, D., Warzybok, A., and Kollmeier, B. (2020). - \newblock {Individual Aided Speech-Recognition Performance and Predictions of Benefit for Listeners With Impaired Hearing Employing FADE.} - \newblock {\em Trends in Hearing}, 24.
+ Schädler, M.~R., Hülsmeier, D., Warzybok, A., and Kollmeier, B. (2020) + \newblock Individual Aided Speech-Recognition Performance and Predictions of Benefit for Listeners With Impaired Hearing Employing FADE. + \newblock {\em Trends in Hearing}, 24, \url{https://doi.org/10.1177%2F2331216520938929} \bibitem[Schädler, 2020b]{schaedler2020b} - Schädler, M.~R. (2020b). + Schädler, M.~R. (2020b) \newblock Optimization and evaluation of an intelligibility-improving signal processing approach (IISPA) for the Hurricane Challenge 2.0 with FADE. - \newblock In {\em Proceedings of INTERSPEECH}, 1331--1335. + \newblock In {\em Proceedings of INTERSPEECH}, 1331--1335, \url{https://doi.org/10.21437/Interspeech.2020-0093} \bibitem[Souza, 2002]{souza2002} - Souza, P.~E. (2002). + Souza, P.~E. (2002) \newblock Effects of compression on speech acoustics, intelligibility, and sound quality. - \newblock {\em Trends in Amplification}, 6(4):131--165. - - \bibitem[Verhulst and Warzybok, 2018]{verhulst2018} - Verhulst, S. and Warzybok, A. (2018). - \newblock Contributions of Low- and High-Frequency Sensorineural Hearing Deficits to Speech Intelligibility in Noise. - \newblock {\em bioRxiv}, \url{https://doi.org/10.1101/358127}. + \newblock {\em Trends in Amplification}, 6(4):131--165, \url{https://doi.org/10.1177%2F108471380200600402} \bibitem[Wagener et~al., 1999]{wagener1999} - Wagener, K., Brand, T., and Kollmeier, B. (1999). + Wagener, K., Brand, T., and Kollmeier, B. (1999) \newblock Entwicklung und Evaluation eines Satztests für die Deutsche Sprache I-III: Design, Optimierung und Evaluation des Oldenburger Satztests. - \newblock {\em Zeitschrift für Audiologie}, 38(1-3):4--15. + \newblock {\em Zeitschrift für Audiologie}, 38(1-3):4--15 \bibitem[Wagener et~al., 2006]{wagener2006} - Wagener, K.~C., Brand, T., and Kollmeier, B. (2006). + Wagener, K.~C., Brand, T., and Kollmeier, B. 
(2006) \newblock The role of silent intervals for sentence intelligibility in fluctuating noise in hearing-impaired listeners. - \newblock {\em International Journal of Audiology}, 45(1):26--33. + \newblock {\em International Journal of Audiology}, 45(1):26--33, \url{https://doi.org/10.1080/14992020500243851} \bibitem[Wardenga et~al., 2015]{wardenga2015} - Wardenga, N., Batsoulis, C., Wagener, K.~C., Brand, T., Lenarz, T., and Maier, H. (2015). + Wardenga, N., Batsoulis, C., Wagener, K.~C., Brand, T., Lenarz, T., and Maier, H. (2015) \newblock Do you hear the noise? The German matrix sentence test with a fixed noise level in subjects with normal hearing and hearing impairment. \newblock {\em International Journal of Audiology}, 54(sup2):71--79, \url{https://doi.org/10.3109/14992027.2015.1079929}. \bibitem[Yund and Buckles, 1995]{yund1995} - Yund, E.~W. and Buckles, K.~M. (1995). + Yund, E.~W. and Buckles, K.~M. (1995) \newblock Multichannel compression hearing aids: Effect of number of channels on speech discrimination in noise. - \newblock {\em The Journal of the Acoustical Society of America}, - 97(2):1206--1223. + \newblock {\em The Journal of the Acoustical Society of America}, 97(2):1206--1223, \url{https://doi.org/10.1121/1.413093} \end{thebibliography}