You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I'm trying to run a fairly simple map that converts a dataset into NumPy arrays. However, it just piles up in memory and doesn't write to disk. I've tried multiple cache techniques, such as specifying the cache dir, setting the max memory, and so on, but none seem to work. What am I missing here?
Steps to reproduce the bug
from pydub import AudioSegment
import io
import base64
import numpy as np
import os
# Root directory for the Hugging Face cache (alternative: "/root/.cache/huggingface/").
CACHE_PATH = "/mnt/extdisk/cache"
# Export the location through HF_HOME before `datasets` gets imported.
os.environ.update(HF_HOME=CACHE_PATH)
import datasets
import logging
# Configure the stream handler completely before attaching it
# (the handler is intended for Jupyter notebook output).
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# Root logger: route INFO-and-above records through the handler built above.
logger = logging.getLogger()
logger.addHandler(handler)
logger.setLevel(logging.INFO)
#datasets.config.IN_MEMORY_MAX_SIZE= 1000#*(2**30) #50 gb
# Print the cache locations that `datasets.config` exposes, one per line.
hf_cache_home = datasets.config.HF_CACHE_HOME
print(hf_cache_home)
hf_datasets_cache = datasets.config.HF_DATASETS_CACHE
print(hf_datasets_cache)
# Decode the base64 string into bytes
def convert_mp3_to_audio_segment(example):
    """Decode one base64-encoded MP3 example into a resampled sample array.

    Intended as a per-example ``map`` function, e.g. ``example = ds['train'][0]``.

    Args:
        example: Mapping with an ``'audio'`` entry holding the base64-encoded
            MP3 bytes and an ``'id'`` entry used in the failure log message.

    Returns:
        A dict with two keys:
        ``'audio'`` -- ``{'sampling_rate': int, 'array': np.ndarray}`` where the
        array holds the samples as floats after resampling to 24 000 Hz;
        ``'duration'`` -- ``len(array) / sampling_rate`` as a float.
        If decoding fails for any reason, a warning is logged and a placeholder
        is returned instead: sampling rate 0, an empty array and duration 0.0.
    """
    try:
        audio_data_bytes = base64.b64decode(example['audio'])
        # Use pydub to load the MP3 audio from the decoded bytes
        audio_segment = AudioSegment.from_file(io.BytesIO(audio_data_bytes), format="mp3")
        # Resample to 24_000
        audio_segment = audio_segment.set_frame_rate(24_000)
        audio = {'sampling_rate': audio_segment.frame_rate,
                 'array': np.array(audio_segment.get_array_of_samples(), dtype="float")}
        # Drop the pydub object early; only the numpy copy is needed from here on.
        del audio_segment
        # NOTE(review): this treats every sample as one frame, i.e. presumably
        # assumes mono audio -- confirm for multi-channel input.
        duration = len(audio['array']) / audio['sampling_rate']
    except Exception as e:
        logger.warning(f"Failed to convert audio for {example['id']}. Error: {e}")
        # Placeholder with the same schema as the success path. `duration` must
        # be assigned here: it is still unbound when the try block fails, and
        # using it as a dict key (as the previous code did) raised inside the
        # handler instead of returning the fallback.
        audio = {'sampling_rate': 0,
                 'array': np.array([], dtype="float")}
        duration = 0.0
    return {'audio': audio, 'duration': duration}
ds = datasets.load_dataset("NbAiLab/nb_distil_speech_noconcat_stortinget", cache_dir=CACHE_PATH, keep_in_memory=False)
#%%
num_proc=32
# Every processed row carries a full decoded waveform as a float array, and each
# worker buffers `writer_batch_size` rows in RAM before flushing them to its
# cache file. Per the `datasets` documentation the default is 1000 rows, which
# with 32 workers means tens of thousands of waveforms held in memory at once.
# A small value makes the workers write to disk much more frequently.
writer_batch_size = 50
ds_processed = (
    ds
    #.select(range(10))
    .map(
        convert_mp3_to_audio_segment,
        num_proc=num_proc,
        writer_batch_size=writer_batch_size,
        desc="Converting mp3 to audio segment",
    ) #, cache_file_name=f"{CACHE_PATH}/stortinget_audio" # , cache_file_name="test"
)
Describe the bug
I'm trying to run a fairly simple map that converts a dataset into NumPy arrays. However, it just piles up in memory and doesn't write to disk. I've tried multiple cache techniques, such as specifying the cache dir, setting the max memory, and so on, but none seem to work. What am I missing here?
Steps to reproduce the bug
Expected behavior
The map should write its results to disk.
Environment info
datasets version: 3.2.0
huggingface_hub version: 0.26.3
fsspec version: 2024.9.0
The text was updated successfully, but these errors were encountered: