From 9a4a03e400a1a36c9f9b694281ffbc5a855d1237 Mon Sep 17 00:00:00 2001
From: Ali Hamdi Ali Fadel
Date: Wed, 26 Jun 2024 22:27:00 +0300
Subject: [PATCH] Replace wav intermediate processing format to mp3

---
 pyproject.toml                        |   1 -
 tafrigh/audio_splitter.py             | 115 ++++++++++----------------
 tafrigh/cli.py                        |  10 +--
 tafrigh/downloader.py                 |   5 +-
 tafrigh/recognizers/wit_recognizer.py |   6 +-
 tafrigh/utils/wit/file_utils.py       |   6 +-
 6 files changed, 57 insertions(+), 86 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 042a0d6..0431246 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,6 @@ wit = [
     "numpy>=1.26.4",
     "pydub>=0.25.1",
     "requests>=2.32.0",
-    "scipy>=1.13.0",
 ]
 whisper = [
     "faster-whisper>=1.0.2",
diff --git a/tafrigh/audio_splitter.py b/tafrigh/audio_splitter.py
index be3bc1d..c98c282 100644
--- a/tafrigh/audio_splitter.py
+++ b/tafrigh/audio_splitter.py
@@ -3,8 +3,10 @@
 
 import numpy as np
 
+from auditok import AudioRegion
 from auditok.core import split
-from scipy.io import wavfile
+from pydub import AudioSegment
+from pydub.generators import WhiteNoise
 
 
 class AudioSplitter:
@@ -18,92 +20,61 @@ def split(
         energy_threshold: float = 50,
         expand_segments_with_noise: bool = False,
         noise_seconds: int = 1,
-        noise_amplitude: int = 10,
+        noise_amplitude: int = 0,
     ) -> list[tuple[str, float, float]]:
-        sampling_rate, data = self._read_audio(file_path)
-        temp_file_name = self._write_temp_audio(sampling_rate, data)
-        segments = self._split_audio(temp_file_name, min_dur, max_dur, max_silence, energy_threshold)
-
-        os.remove(temp_file_name)
-
-        if expand_segments_with_noise:
-            expanded_segments = self._expand_segments_with_noise(
-                segments,
-                noise_seconds,
-                noise_amplitude,
-                sampling_rate,
-                data.dtype,
-            )
-        else:
-            expanded_segments = [(segment, segment.meta.start, segment.meta.end) for segment in segments]
-
-        return self._save_segments(output_dir, sampling_rate, expanded_segments)
-
-    def _read_audio(self, file_path: str) -> tuple[int, np.ndarray]:
-        sampling_rate, data = wavfile.read(file_path)
-
-        if len(data.shape) > 1 and data.shape[1] > 1:
-            data = np.mean(data, axis=1)
-
-        return sampling_rate, data
-
-    def _write_audio(self, file_path: str, sampling_rate: int, data: np.ndarray) -> None:
-        wavfile.write(file_path, sampling_rate, data.astype(np.int16))
-
-    def _write_temp_audio(self, sampling_rate: int, data: np.ndarray) -> str:
-        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
-            temp_file_name = temp_file.name
-            self._write_audio(temp_file_name, sampling_rate, data)
-
-        return temp_file_name
-
-    def _split_audio(
-        self,
-        temp_file_name: str,
-        min_dur: float,
-        max_dur: float,
-        max_silence: float,
-        energy_threshold: float,
-    ):
-        return split(
-            temp_file_name,
+        segments = split(
+            file_path,
             min_dur=min_dur,
             max_dur=max_dur,
             max_silence=max_silence,
             energy_threshold=energy_threshold,
         )
 
-    def _expand_segments_with_noise(
+        if expand_segments_with_noise:
+            segments = [
+                (
+                    self._expand_segment_with_noise(segment, noise_seconds, noise_amplitude),
+                    segment.meta.start,
+                    segment.meta.end,
+                ) for segment in segments
+            ]
+
+        return self._save_segments(output_dir, segments)
+
+    def _expand_segment_with_noise(
         self,
-        segments: list,
+        segment: AudioRegion,
         noise_seconds: int,
         noise_amplitude: int,
-        sampling_rate: int,
-        dtype: np.dtype,
-    ) -> list[tuple[np.ndarray, float, float]]:
-        expanded_segments = []
+    ) -> AudioSegment:
 
-        for segment in segments:
-            # Have different noise in the beginning and the end gave us better results :).
-            prepend_noise = np.random.normal(0, noise_amplitude, int(noise_seconds * sampling_rate)).astype(dtype)
-            append_noise = np.random.normal(0, noise_amplitude, int(noise_seconds * sampling_rate)).astype(dtype)
+        audio_segment = AudioSegment(
+            segment._data,
+            frame_rate=segment.sampling_rate,
+            sample_width=segment.sample_width,
+            channels=segment.channels,
+        )
 
-            expanded_segment = np.concatenate((prepend_noise, segment, append_noise))
-            expanded_segments.append((expanded_segment, segment.meta.start, segment.meta.end))
+        pre_noise = WhiteNoise().to_audio_segment(duration=noise_seconds * 1000, volume=noise_amplitude)
+        post_noise = WhiteNoise().to_audio_segment(duration=noise_seconds * 1000, volume=noise_amplitude)
 
-        return expanded_segments
+        return pre_noise + audio_segment + post_noise
 
     def _save_segments(
         self,
         output_dir: str,
-        sampling_rate: int,
-        expanded_segments: list[tuple[np.ndarray, float, float]],
+        segments: list[AudioSegment | tuple[AudioSegment, float, float]],
     ) -> list[tuple[str, float, float]]:
-        segments = []
-
-        for i, (expanded_segment, start, end) in enumerate(expanded_segments):
-            output_file = os.path.join(output_dir, f"segment_{i + 1}.wav")
-            self._write_audio(output_file, sampling_rate, expanded_segment)
-            segments.append((output_file, start, end))
-
-        return segments
+        segment_paths = []
+
+        for i, segment in enumerate(segments):
+            output_file = os.path.join(output_dir, f'segment_{i + 1}.mp3')
+
+            if isinstance(segment, tuple):
+                segment[0].export(output_file, format='mp3')
+                segment_paths.append((output_file, segment[1], segment[2]))
+            else:
+                segment.save(output_file)
+                segment_paths.append((output_file, segment.meta.start, segment.meta.end))
+
+        return segment_paths
diff --git a/tafrigh/cli.py b/tafrigh/cli.py
index df0430e..61beb6b 100644
--- a/tafrigh/cli.py
+++ b/tafrigh/cli.py
@@ -148,8 +148,8 @@ def process_local(
         file_path = str(file['file_path'].absolute())
 
         if config.use_wit():
-            wav_file_path = str(wit_file_utils.convert_to_wav(file['file_path']).absolute())
-            recognize_generator = WitRecognizer(verbose=config.input.verbose).recognize(wav_file_path, config.wit)
+            mp3_file_path = str(wit_file_utils.convert_to_mp3(file['file_path']).absolute())
+            recognize_generator = WitRecognizer(verbose=config.input.verbose).recognize(mp3_file_path, config.wit)
         else:
             recognize_generator = WhisperRecognizer(verbose=config.input.verbose).recognize(
                 file_path,
@@ -165,8 +165,8 @@
                 segments = exception.value
                 break
 
-        if config.use_wit() and file['file_path'].suffix != '.wav':
-            Path(wav_file_path).unlink(missing_ok=True)
+        if config.use_wit() and file['file_path'].suffix != '.mp3':
+            Path(mp3_file_path).unlink(missing_ok=True)
 
         writer.write_all(Path(file['file_name']).stem, segments, config.output)
 
@@ -218,7 +218,7 @@ def process_url(
 
             continue
 
-        file_path = os.path.join(config.output.output_dir, f"{element['id']}.wav")
+        file_path = os.path.join(config.output.output_dir, f"{element['id']}.mp3")
 
         if config.use_wit():
             recognize_generator = WitRecognizer(verbose=config.input.verbose).recognize(file_path, config.wit)
diff --git a/tafrigh/downloader.py b/tafrigh/downloader.py
index 8ea80f0..a57e290 100644
--- a/tafrigh/downloader.py
+++ b/tafrigh/downloader.py
@@ -17,7 +17,8 @@ def _config(self, download_archive: Union[str, bool]) -> dict[str, Any]:
         return {
             'quiet': True,
             'verbose': False,
-            'format': 'wav/bestaudio/best',
+            'format': 'bestaudio',
+            'extract_audio': True,
             'outtmpl': os.path.join(self.output_dir, '%(id)s.%(ext)s'),
             'ignoreerrors': True,
             'download_archive': download_archive,
@@ -25,7 +26,7 @@ def _config(self, download_archive: Union[str, bool]) -> dict[str, Any]:
             'postprocessors': [
                 {
                     'key': 'FFmpegExtractAudio',
-                    'preferredcodec': 'wav',
+                    'preferredcodec': 'mp3',
                 },
             ],
         }
diff --git a/tafrigh/recognizers/wit_recognizer.py b/tafrigh/recognizers/wit_recognizer.py
index 3128d5e..2009891 100644
--- a/tafrigh/recognizers/wit_recognizer.py
+++ b/tafrigh/recognizers/wit_recognizer.py
@@ -115,8 +115,8 @@ def _process_segment(
 
         segment_file_path, start, end = segment
 
-        with open(segment_file_path, 'rb') as wav_file:
-            audio_content = wav_file.read()
+        with open(segment_file_path, 'rb') as mp3_file:
+            audio_content = mp3_file.read()
 
         retries = 5
 
@@ -127,7 +127,7 @@
                     'https://api.wit.ai/speech',
                     headers={
                         'Accept': 'application/vnd.wit.20200513+json',
-                        'Content-Type': 'audio/wav',
+                        'Content-Type': 'audio/mpeg3',
                         'Authorization': f'Bearer {wit_config.wit_client_access_tokens[wit_client_access_token_index]}',
                     },
                     data=audio_content,
diff --git a/tafrigh/utils/wit/file_utils.py b/tafrigh/utils/wit/file_utils.py
index 37afb58..d087d3d 100644
--- a/tafrigh/utils/wit/file_utils.py
+++ b/tafrigh/utils/wit/file_utils.py
@@ -3,8 +3,8 @@
 from pydub import AudioSegment
 
 
-def convert_to_wav(file: Path) -> Path:
+def convert_to_mp3(file: Path) -> Path:
     audio_file = AudioSegment.from_file(str(file))
-    converted_file_path = file.with_suffix('.wav')
-    audio_file.export(str(converted_file_path), format='wav')
+    converted_file_path = file.with_suffix('.mp3')
+    audio_file.export(str(converted_file_path), format='mp3')
     return converted_file_path
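
Below is a minimal usage sketch of the MP3-based flow this patch introduces, not part of the patch itself. The import paths, class names, and parameter names are taken or inferred from the diff above; the input file name, the output directory, the no-argument AudioSplitter() constructor, and the values for min_dur, max_dur, and max_silence are illustrative assumptions (their real defaults are not visible in these hunks).

from pathlib import Path

from tafrigh.audio_splitter import AudioSplitter
from tafrigh.utils.wit.file_utils import convert_to_mp3

# Convert the source media to MP3 first, mirroring what cli.process_local
# now does before handing the file to WitRecognizer.
mp3_path = convert_to_mp3(Path('lecture.m4a'))  # hypothetical input file

output_dir = Path('segments')  # hypothetical output directory for the segments
output_dir.mkdir(exist_ok=True)

# Split the MP3 into speech segments. Each segment is exported into
# output_dir as segment_<n>.mp3 and returned as a (path, start, end) tuple.
segments = AudioSplitter().split(
    file_path=str(mp3_path),
    output_dir=str(output_dir),
    min_dur=1.0,                        # illustrative values; the diff does not
    max_dur=15.0,                       # show the real defaults for these
    max_silence=0.5,                    # three parameters
    energy_threshold=50,
    expand_segments_with_noise=True,    # pad each segment with white noise
    noise_seconds=1,
    noise_amplitude=0,                  # pydub gain in dBFS (the new default)
)

for path, start, end in segments:
    print(f'{path}: {start:.2f}s -> {end:.2f}s')

In the real pipeline these pieces are driven by WitRecognizer rather than called directly; the sketch only exercises the converter and splitter in isolation.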