Skip to content

Commit

Permalink
Mod: Update default max_workers to 1 to avoid too many request error …
Browse files Browse the repository at this point in the history
…from youtube, and load a random item during check script.
  • Loading branch information
Labbeti committed Jan 5, 2024
1 parent dfcdeba commit 0837668
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package-pip.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ jobs:
- name: Try to download AudioCaps val
run: |
aac-datasets-download --verbose 2 audiocaps --subsets val
aac-datasets-download --verbose 2 audiocaps --subsets val --max_workers 2 --with_tags true
- name: Check data root
run: |
Expand Down
5 changes: 5 additions & 0 deletions src/aac_datasets/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import logging
import os.path as osp
import random

from argparse import ArgumentParser, Namespace
from typing import Dict, Iterable, Union
Expand Down Expand Up @@ -63,6 +64,10 @@ def check_directory(
for subset in ds_class.CARD.SUBSETS:
try:
ds = ds_class(root, subset, verbose=0)
if len(ds) > 0:
# Try to load a random item
idx = random.randint(0, len(ds) - 1)
_item = ds[idx]
found_dsets[subset] = ds

except RuntimeError:
Expand Down
11 changes: 6 additions & 5 deletions src/aac_datasets/datasets/audiocaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def __init__(
exclude_removed_audio: bool = True,
ffmpeg_path: Union[str, Path, None] = None,
flat_captions: bool = False,
max_workers: Optional[int] = None,
max_workers: Optional[int] = 1,
sr: int = 32_000,
with_tags: bool = False,
ytdlp_path: Union[str, Path, None] = None,
Expand Down Expand Up @@ -148,9 +148,10 @@ def __init__(
defaults to "ffmpeg".
:param flat_captions: If True, map captions to audio instead of audio to caption.
defaults to False.
:param max_workers: Number of thread given to ThreadPoolExecutor during download.
The value None will use `min(32, os.cpu_count() + 4)` workers.
defaults to None.
:param max_workers: Number of threads to download audio files in parallel.
Do not use a value too high to avoid "Too Many Requests" error.
The value None will use `min(32, os.cpu_count() + 4)` workers, which is the default of ThreadPoolExecutor.
defaults to 1.
:param sr: The sample rate used for audio files in the dataset (in Hz).
Since original YouTube videos are recorded in various settings, this parameter allows downloading audio files with a specific sample rate.
defaults to 32000.
Expand All @@ -177,14 +178,14 @@ def __init__(
subset=subset,
force=force_download,
verbose=verbose,
verify_files=verify_files,
audio_duration=audio_duration,
audio_format=audio_format,
audio_n_channels=audio_n_channels,
download_audio=download_audio,
ffmpeg_path=ffmpeg_path,
max_workers=max_workers,
sr=sr,
verify_files=verify_files,
with_tags=with_tags,
ytdlp_path=ytdlp_path,
)
Expand Down
39 changes: 26 additions & 13 deletions src/aac_datasets/datasets/functional/audiocaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def load_audiocaps_dataset(

# Build global mappings
fnames_dic = dict.fromkeys(
f"{line['youtube_id']}_{line['start_time']}.{audio_format}"
_AUDIO_FNAME_FORMAT.format(**line, audio_format=audio_format)
for line in captions_data
)
audio_fnames_on_disk = dict.fromkeys(os.listdir(audio_subset_dpath))
Expand Down Expand Up @@ -192,7 +192,7 @@ def load_audiocaps_dataset(
start_time = line["start_time"]
caption = line["caption"]

fname = f"{youtube_id}_{start_time}.{audio_format}"
fname = _AUDIO_FNAME_FORMAT.format(**line, audio_format=audio_format)
if fname in fname_to_idx:
idx = fname_to_idx[fname]

Expand Down Expand Up @@ -220,7 +220,9 @@ def load_audiocaps_dataset(
youtube_id = line["YTID"]
# Note : In audioset, start_time is a string repr of a float value, audiocaps it is a string repr of an integer
start_time = int(float(line["start_seconds"]))
fname = f"{youtube_id}_{start_time}.{audio_format}"
fname = _AUDIO_FNAME_FORMAT.format(
youtube_id=youtube_id, start_time=start_time, audio_format=audio_format
)
if fname in fname_to_idx:
tags_mid = line["positive_labels"]
tags_mid = tags_mid.split(",")
Expand Down Expand Up @@ -263,7 +265,7 @@ def download_audiocaps_dataset(
audio_n_channels: int = 1,
download_audio: bool = True,
ffmpeg_path: Union[str, Path, None] = None,
max_workers: Optional[int] = None,
max_workers: Optional[int] = 1,
sr: int = 32_000,
ytdlp_path: Union[str, Path, None] = None,
with_tags: bool = False,
Expand Down Expand Up @@ -292,6 +294,10 @@ def download_audiocaps_dataset(
defaults to True.
:param ffmpeg_path: Path to ffmpeg executable file.
defaults to "ffmpeg".
:param max_workers: Number of threads to download audio files in parallel.
Do not use a value too high to avoid "Too Many Requests" error.
The value None will use `min(32, os.cpu_count() + 4)` workers, which is the default of ThreadPoolExecutor.
defaults to 1.
:param sr: The sample rate used for audio files in the dataset (in Hz).
Since original YouTube videos are recorded in various settings, this parameter allows downloading audio files with a specific sample rate.
defaults to 32000.
Expand Down Expand Up @@ -355,7 +361,9 @@ def _cast_line(line: Dict[str, Any], audio_format: str) -> Dict[str, Any]:
)

start_time = int(start_time)
fname = f"{youtube_id}_{start_time}.{audio_format}"
fname = _AUDIO_FNAME_FORMAT.format(
youtube_id=youtube_id, start_time=start_time, audio_format=audio_format
)

line.update({"start_time": start_time, "fname": fname})
return line
Expand Down Expand Up @@ -450,16 +458,17 @@ def download_audiocaps_datasets(
subsets: Union[str, Iterable[str]] = AudioCapsCard.DEFAULT_SUBSET,
force: bool = False,
verbose: int = 0,
verify_files: bool = False,
# AudioCaps-specific args
audio_duration: float = 10.0,
audio_format: str = "flac",
audio_n_channels: int = 1,
download_audio: bool = True,
ffmpeg_path: Union[str, Path, None] = None,
max_workers: Optional[int] = 1,
sr: int = 32_000,
verify_files: bool = False,
ytdlp_path: Union[str, Path, None] = None,
with_tags: bool = False,
ytdlp_path: Union[str, Path, None] = None,
) -> None:
"""Function helper to download a list of subsets. See :func:`~aac_datasets.datasets.functional.audiocaps.download_audiocaps_dataset` for details."""
if isinstance(subsets, str):
Expand All @@ -471,15 +480,16 @@ def download_audiocaps_datasets(
root=root,
force=force,
verbose=verbose,
verify_files=verify_files,
audio_duration=audio_duration,
audio_format=audio_format,
audio_n_channels=audio_n_channels,
download_audio=download_audio,
ffmpeg_path=ffmpeg_path,
max_workers=max_workers,
sr=sr,
verify_files=verify_files,
ytdlp_path=ytdlp_path,
with_tags=with_tags,
ytdlp_path=ytdlp_path,
)
for subset in subsets:
download_audiocaps_dataset(
Expand Down Expand Up @@ -539,11 +549,11 @@ def _is_prepared_audiocaps(
msgs.append(f"Cannot find directory '{audio_subset_dpath}'.")
else:
audio_fnames = os.listdir(audio_subset_dpath)
audio_fnames = [
fname for fname in audio_fnames if fname.endswith(f".{audio_format}")
]
audio_fnames = [fname for fname in audio_fnames if fname.endswith(audio_format)]
if len(audio_fnames) == 0:
msgs.append(f"Cannot find any audio file in '{audio_subset_dpath}'.")
msgs.append(
f"Cannot find any audio {audio_format} file in '{audio_subset_dpath}'."
)

if not osp.isfile(captions_fpath):
msgs.append(f"Cannot find file '{captions_fpath}'.")
Expand Down Expand Up @@ -814,3 +824,6 @@ def _get_youtube_link_embed(
"url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv",
},
}

# Audio filename template for AudioCaps.
# It is meant to be filled with `str.format`, which must receive the
# `youtube_id`, `start_time` and `audio_format` fields as keyword arguments.
_AUDIO_FNAME_FORMAT = "{youtube_id}_{start_time}.{audio_format}"
16 changes: 14 additions & 2 deletions src/aac_datasets/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@
WavCapsCard,
download_wavcaps_datasets,
)
from aac_datasets.utils.cmdline import _str_to_bool, _str_to_opt_str, _setup_logging
from aac_datasets.utils.cmdline import (
_str_to_bool,
_str_to_opt_int,
_str_to_opt_str,
_setup_logging,
)
from aac_datasets.utils.globals import (
get_default_root,
get_default_ffmpeg_path,
Expand Down Expand Up @@ -94,6 +99,12 @@ def _get_main_download_args() -> Namespace:
choices=AudioCapsCard.SUBSETS,
help="AudioCaps subsets to download.",
)
audiocaps_subparser.add_argument(
"--max_workers",
type=_str_to_opt_int,
default=1,
help="Number of workers used for downloading multiple files in parallel.",
)

clotho_subparser = subparsers.add_parser(ClothoCard.NAME)
clotho_subparser.add_argument(
Expand Down Expand Up @@ -185,8 +196,9 @@ def _main_download() -> None:
force=args.force,
verbose=args.verbose,
ffmpeg_path=args.ffmpeg_path,
ytdlp_path=args.ytdlp_path,
max_workers=args.max_workers,
with_tags=args.with_tags,
ytdlp_path=args.ytdlp_path,
)

elif args.dataset == ClothoCard.NAME:
Expand Down
12 changes: 10 additions & 2 deletions src/aac_datasets/utils/cmdline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from typing import Optional


_TRUE_VALUES = ("true", "1", "t", "yes", "y")
_FALSE_VALUES = ("false", "0", "f", "no", "n")
_TRUE_VALUES = ("true", "t", "yes", "y", "1")
_FALSE_VALUES = ("false", "f", "no", "n", "0")


def _str_to_bool(s: str) -> bool:
Expand All @@ -23,6 +23,14 @@ def _str_to_bool(s: str) -> bool:
)


def _str_to_opt_int(s: str) -> Optional[int]:
s = str(s).strip().lower()
if s == "none":
return None
else:
return int(s)


def _str_to_opt_str(s: str) -> Optional[str]:
s = str(s)
if s.lower() == "none":
Expand Down

0 comments on commit 0837668

Please sign in to comment.