Skip to content

Commit 232f9de

Browse files
authored
[TTS] Fix TTS audio preprocessing bugs (NVIDIA#6628)
Signed-off-by: Ryan <[email protected]>
1 parent f8e46a9 commit 232f9de

File tree

5 files changed

+76
-79
lines changed

5 files changed

+76
-79
lines changed

nemo/collections/tts/parts/preprocessing/features.py

+4
Original file line number | Diff line number | Diff line change
@@ -131,10 +131,14 @@ def __init__(
131131
n_fft=win_length,
132132
lowfreq=lowfreq,
133133
highfreq=highfreq,
134+
mag_power=1.0,
134135
log=log,
135136
log_zero_guard_type=log_zero_guard_type,
136137
log_zero_guard_value=log_zero_guard_value,
137138
mel_norm=mel_norm,
139+
normalize=None,
140+
preemph=None,
141+
dither=0.0,
138142
)
139143

140144
def compute_mel_spec(self, manifest_entry: dict, audio_dir: Path) -> Tensor:

nemo/collections/tts/parts/utils/tts_dataset_utils.py

+3
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,9 @@ def normalize_volume(audio: np.array, volume_level: float) -> np.array:
6767
if not (0.0 <= volume_level <= 1.0):
6868
raise ValueError(f"Volume must be in range [0.0, 1.0], received {volume_level}")
6969

70+
if audio.size == 0:
71+
return audio
72+
7073
max_sample = np.max(np.abs(audio))
7174
if max_sample == 0:
7275
return audio

scripts/dataset_processing/tts/audio_processing/preprocess_audio.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,7 @@ def _process_entry(
128128
if audio_trimmer is not None:
129129
audio, start_i, end_i = audio_trimmer.trim_audio(audio=audio, sample_rate=sample_rate, audio_id=audio_path)
130130

131-
if output_sample_rate is not None:
131+
if output_sample_rate:
132132
audio = librosa.resample(y=audio, orig_sr=sample_rate, target_sr=output_sample_rate)
133133
sample_rate = output_sample_rate
134134

@@ -140,7 +140,7 @@ def _process_entry(
140140
original_duration = librosa.get_duration(filename=audio_path)
141141
output_duration = librosa.get_duration(filename=output_path)
142142

143-
entry["duration"] = output_duration
143+
entry["duration"] = round(output_duration, 2)
144144

145145
if os.path.isabs(audio_filepath):
146146
entry["audio_filepath"] = output_path

tests/collections/tts/data/test_data_utils.py

-76
This file was deleted.

tests/collections/tts/parts/utils/test_tts_dataset_utils.py

+67 -1
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,10 @@
1414

1515
from pathlib import Path
1616

17+
import numpy as np
1718
import pytest
1819

19-
from nemo.collections.tts.parts.utils.tts_dataset_utils import get_abs_rel_paths, get_audio_filepaths
20+
from nemo.collections.tts.parts.utils.tts_dataset_utils import get_abs_rel_paths, get_audio_filepaths, normalize_volume
2021

2122

2223
class TestTTSDatasetUtils:
@@ -53,3 +54,68 @@ def test_get_audio_paths(self):
5354

5455
assert abs_path == Path("/home/audio/examples/example.wav")
5556
assert rel_path == audio_rel_path
57+
58+
@pytest.mark.run_only_on('CPU')
59+
@pytest.mark.unit
60+
def test_normalize_volume(self):
61+
input_audio = np.array([0.0, 0.1, 0.3, 0.5])
62+
expected_output = np.array([0.0, 0.18, 0.54, 0.9])
63+
64+
output_audio = normalize_volume(audio=input_audio, volume_level=0.9)
65+
66+
np.testing.assert_array_almost_equal(output_audio, expected_output)
67+
68+
@pytest.mark.run_only_on('CPU')
69+
@pytest.mark.unit
70+
def test_normalize_volume_negative_peak(self):
71+
input_audio = np.array([0.0, 0.1, -0.3, -1.0, 0.5])
72+
expected_output = np.array([0.0, 0.05, -0.15, -0.5, 0.25])
73+
74+
output_audio = normalize_volume(audio=input_audio, volume_level=0.5)
75+
76+
np.testing.assert_array_almost_equal(output_audio, expected_output)
77+
78+
@pytest.mark.run_only_on('CPU')
79+
@pytest.mark.unit
80+
def test_normalize_volume_zero(self):
81+
input_audio = np.array([0.0, 0.1, 0.3, 0.5])
82+
expected_output = np.array([0.0, 0.0, 0.0, 0.0])
83+
84+
output_audio = normalize_volume(audio=input_audio, volume_level=0.0)
85+
86+
np.testing.assert_array_almost_equal(output_audio, expected_output)
87+
88+
@pytest.mark.run_only_on('CPU')
89+
@pytest.mark.unit
90+
def test_normalize_volume_max(self):
91+
input_audio = np.array([0.0, 0.1, 0.3, 0.5])
92+
expected_output = np.array([0.0, 0.2, 0.6, 1.0])
93+
94+
output_audio = normalize_volume(audio=input_audio, volume_level=1.0)
95+
96+
np.testing.assert_array_almost_equal(output_audio, expected_output)
97+
98+
@pytest.mark.run_only_on('CPU')
99+
@pytest.mark.unit
100+
def test_normalize_volume_zeros(self):
101+
input_audio = np.array([0.0, 0.0, 0.0])
102+
103+
output_audio = normalize_volume(audio=input_audio, volume_level=0.5)
104+
105+
np.testing.assert_array_almost_equal(output_audio, input_audio)
106+
107+
@pytest.mark.run_only_on('CPU')
108+
@pytest.mark.unit
109+
def test_normalize_volume_empty(self):
110+
input_audio = np.array([])
111+
112+
output_audio = normalize_volume(audio=input_audio, volume_level=1.0)
113+
114+
np.testing.assert_array_almost_equal(output_audio, input_audio)
115+
116+
@pytest.mark.run_only_on('CPU')
117+
@pytest.mark.unit
118+
def test_normalize_volume_out_of_range(self):
119+
input_audio = np.array([0.0, 0.1, 0.3, 0.5])
120+
with pytest.raises(ValueError, match="Volume must be in range"):
121+
normalize_volume(audio=input_audio, volume_level=2.0)

0 commit comments

Comments
 (0)