Mod: Update default max_workers to 1 to avoid "Too Many Requests" errors …

…from YouTube, and load a random item in the check script.
Labbeti · Jan 5, 2024 · 0837668 · 0837668
1 parent dfcdeba
commit 0837668
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 23 deletions.
diff --git a/.github/workflows/python-package-pip.yaml b/.github/workflows/python-package-pip.yaml
@@ -87,7 +87,7 @@ jobs:
   
     - name: Try to download AudioCaps val
       run: |
-        aac-datasets-download --verbose 2 audiocaps --subsets val
+        aac-datasets-download --verbose 2 audiocaps --subsets val --max_workers 2 --with_tags true
   
     - name: Check data root
       run: |

diff --git a/src/aac_datasets/check.py b/src/aac_datasets/check.py
@@ -3,6 +3,7 @@
 
 import logging
 import os.path as osp
+import random
 
 from argparse import ArgumentParser, Namespace
 from typing import Dict, Iterable, Union
@@ -63,6 +64,10 @@ def check_directory(
         for subset in ds_class.CARD.SUBSETS:
             try:
                 ds = ds_class(root, subset, verbose=0)
+                if len(ds) > 0:
+                    # Try to load a random item
+                    idx = random.randint(0, len(ds) - 1)
+                    _item = ds[idx]
                 found_dsets[subset] = ds
 
             except RuntimeError:

diff --git a/src/aac_datasets/datasets/audiocaps.py b/src/aac_datasets/datasets/audiocaps.py
@@ -111,7 +111,7 @@ def __init__(
         exclude_removed_audio: bool = True,
         ffmpeg_path: Union[str, Path, None] = None,
         flat_captions: bool = False,
-        max_workers: Optional[int] = None,
+        max_workers: Optional[int] = 1,
         sr: int = 32_000,
         with_tags: bool = False,
         ytdlp_path: Union[str, Path, None] = None,
@@ -148,9 +148,10 @@ def __init__(
             defaults to "ffmpeg".
         :param flat_captions: If True, map captions to audio instead of audio to caption.
             defaults to False.
-        :param max_workers: Number of thread given to ThreadPoolExecutor during download.
-            The value None will use `min(32, os.cpu_count() + 4)` workers.
-            defaults to None.
+        :param max_workers: Number of threads to download audio files in parallel.
+            Do not use a value too high to avoid "Too Many Requests" error.
+            The value None will use `min(32, os.cpu_count() + 4)` workers, which is the default of ThreadPoolExecutor.
+            defaults to 1.
         :param sr: The sample rate used for audio files in the dataset (in Hz).
             Since original YouTube videos are recorded in various settings, this parameter allows downloading all audio files with a specific sample rate.
             defaults to 32000.
@@ -177,14 +178,14 @@ def __init__(
                 subset=subset,
                 force=force_download,
                 verbose=verbose,
+                verify_files=verify_files,
                 audio_duration=audio_duration,
                 audio_format=audio_format,
                 audio_n_channels=audio_n_channels,
                 download_audio=download_audio,
                 ffmpeg_path=ffmpeg_path,
                 max_workers=max_workers,
                 sr=sr,
-                verify_files=verify_files,
                 with_tags=with_tags,
                 ytdlp_path=ytdlp_path,
             )

diff --git a/src/aac_datasets/datasets/functional/audiocaps.py b/src/aac_datasets/datasets/functional/audiocaps.py
@@ -162,7 +162,7 @@ def load_audiocaps_dataset(
 
     # Build global mappings
     fnames_dic = dict.fromkeys(
-        f"{line['youtube_id']}_{line['start_time']}.{audio_format}"
+        _AUDIO_FNAME_FORMAT.format(**line, audio_format=audio_format)
         for line in captions_data
     )
     audio_fnames_on_disk = dict.fromkeys(os.listdir(audio_subset_dpath))
@@ -192,7 +192,7 @@ def load_audiocaps_dataset(
         start_time = line["start_time"]
         caption = line["caption"]
 
-        fname = f"{youtube_id}_{start_time}.{audio_format}"
+        fname = _AUDIO_FNAME_FORMAT.format(**line, audio_format=audio_format)
         if fname in fname_to_idx:
             idx = fname_to_idx[fname]
 
@@ -220,7 +220,9 @@ def load_audiocaps_dataset(
         youtube_id = line["YTID"]
         # Note : In audioset, start_time is a string repr of a float value, audiocaps it is a string repr of an integer
         start_time = int(float(line["start_seconds"]))
-        fname = f"{youtube_id}_{start_time}.{audio_format}"
+        fname = _AUDIO_FNAME_FORMAT.format(
+            youtube_id=youtube_id, start_time=start_time, audio_format=audio_format
+        )
         if fname in fname_to_idx:
             tags_mid = line["positive_labels"]
             tags_mid = tags_mid.split(",")
@@ -263,7 +265,7 @@ def download_audiocaps_dataset(
     audio_n_channels: int = 1,
     download_audio: bool = True,
     ffmpeg_path: Union[str, Path, None] = None,
-    max_workers: Optional[int] = None,
+    max_workers: Optional[int] = 1,
     sr: int = 32_000,
     ytdlp_path: Union[str, Path, None] = None,
     with_tags: bool = False,
@@ -292,6 +294,10 @@ def download_audiocaps_dataset(
         defaults to True.
     :param ffmpeg_path: Path to ffmpeg executable file.
         defaults to "ffmpeg".
+    :param max_workers: Number of threads to download audio files in parallel.
+        Do not use a value too high to avoid "Too Many Requests" error.
+        The value None will use `min(32, os.cpu_count() + 4)` workers, which is the default of ThreadPoolExecutor.
+        defaults to 1.
     :param sr: The sample rate used for audio files in the dataset (in Hz).
         Since original YouTube videos are recorded in various settings, this parameter allows downloading all audio files with a specific sample rate.
         defaults to 32000.
@@ -355,7 +361,9 @@ def _cast_line(line: Dict[str, Any], audio_format: str) -> Dict[str, Any]:
                 )
 
             start_time = int(start_time)
-            fname = f"{youtube_id}_{start_time}.{audio_format}"
+            fname = _AUDIO_FNAME_FORMAT.format(
+                youtube_id=youtube_id, start_time=start_time, audio_format=audio_format
+            )
 
             line.update({"start_time": start_time, "fname": fname})
             return line
@@ -450,16 +458,17 @@ def download_audiocaps_datasets(
     subsets: Union[str, Iterable[str]] = AudioCapsCard.DEFAULT_SUBSET,
     force: bool = False,
     verbose: int = 0,
+    verify_files: bool = False,
     # AudioCaps-specific args
     audio_duration: float = 10.0,
     audio_format: str = "flac",
     audio_n_channels: int = 1,
     download_audio: bool = True,
     ffmpeg_path: Union[str, Path, None] = None,
+    max_workers: Optional[int] = 1,
     sr: int = 32_000,
-    verify_files: bool = False,
-    ytdlp_path: Union[str, Path, None] = None,
     with_tags: bool = False,
+    ytdlp_path: Union[str, Path, None] = None,
 ) -> None:
     """Function helper to download a list of subsets. See :func:`~aac_datasets.datasets.functional.audiocaps.download_audiocaps_dataset` for details."""
     if isinstance(subsets, str):
@@ -471,15 +480,16 @@ def download_audiocaps_datasets(
         root=root,
         force=force,
         verbose=verbose,
+        verify_files=verify_files,
         audio_duration=audio_duration,
         audio_format=audio_format,
         audio_n_channels=audio_n_channels,
         download_audio=download_audio,
         ffmpeg_path=ffmpeg_path,
+        max_workers=max_workers,
         sr=sr,
-        verify_files=verify_files,
-        ytdlp_path=ytdlp_path,
         with_tags=with_tags,
+        ytdlp_path=ytdlp_path,
     )
     for subset in subsets:
         download_audiocaps_dataset(
@@ -539,11 +549,11 @@ def _is_prepared_audiocaps(
         msgs.append(f"Cannot find directory '{audio_subset_dpath}'.")
     else:
         audio_fnames = os.listdir(audio_subset_dpath)
-        audio_fnames = [
-            fname for fname in audio_fnames if fname.endswith(f".{audio_format}")
-        ]
+        audio_fnames = [fname for fname in audio_fnames if fname.endswith(audio_format)]
         if len(audio_fnames) == 0:
-            msgs.append(f"Cannot find any audio file in '{audio_subset_dpath}'.")
+            msgs.append(
+                f"Cannot find any audio {audio_format} file in '{audio_subset_dpath}'."
+            )
 
     if not osp.isfile(captions_fpath):
         msgs.append(f"Cannot find file '{captions_fpath}'.")
@@ -814,3 +824,6 @@ def _get_youtube_link_embed(
         "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv",
     },
 }
+
+# Audio filename format for AudioCaps
+_AUDIO_FNAME_FORMAT = "{youtube_id}_{start_time}.{audio_format}"
diff --git a/src/aac_datasets/download.py b/src/aac_datasets/download.py
@@ -25,7 +25,12 @@
     WavCapsCard,
     download_wavcaps_datasets,
 )
-from aac_datasets.utils.cmdline import _str_to_bool, _str_to_opt_str, _setup_logging
+from aac_datasets.utils.cmdline import (
+    _str_to_bool,
+    _str_to_opt_int,
+    _str_to_opt_str,
+    _setup_logging,
+)
 from aac_datasets.utils.globals import (
     get_default_root,
     get_default_ffmpeg_path,
@@ -94,6 +99,12 @@ def _get_main_download_args() -> Namespace:
         choices=AudioCapsCard.SUBSETS,
         help="AudioCaps subsets to download.",
     )
+    audiocaps_subparser.add_argument(
+        "--max_workers",
+        type=_str_to_opt_int,
+        default=1,
+        help="Number of workers used for downloading multiple files in parallel.",
+    )
 
     clotho_subparser = subparsers.add_parser(ClothoCard.NAME)
     clotho_subparser.add_argument(
@@ -185,8 +196,9 @@ def _main_download() -> None:
             force=args.force,
             verbose=args.verbose,
             ffmpeg_path=args.ffmpeg_path,
-            ytdlp_path=args.ytdlp_path,
+            max_workers=args.max_workers,
             with_tags=args.with_tags,
+            ytdlp_path=args.ytdlp_path,
         )
 
     elif args.dataset == ClothoCard.NAME:

diff --git a/src/aac_datasets/utils/cmdline.py b/src/aac_datasets/utils/cmdline.py
@@ -7,8 +7,8 @@
 from typing import Optional
 
 
-_TRUE_VALUES = ("true", "1", "t", "yes", "y")
-_FALSE_VALUES = ("false", "0", "f", "no", "n")
+_TRUE_VALUES = ("true", "t", "yes", "y", "1")
+_FALSE_VALUES = ("false", "f", "no", "n", "0")
 
 
 def _str_to_bool(s: str) -> bool:
@@ -23,6 +23,14 @@ def _str_to_bool(s: str) -> bool:
         )
 
 
+def _str_to_opt_int(s: str) -> Optional[int]:  # Parse "none" (case-insensitive) as None, anything else as an int.
+    s = str(s).strip().lower()  # Normalize: coerce to str, trim surrounding whitespace, lowercase.
+    if s == "none":
+        return None
+    else:
+        return int(s)  # int() raises ValueError when the text is not a valid integer.
+
+
 def _str_to_opt_str(s: str) -> Optional[str]:
     s = str(s)
     if s.lower() == "none":