Skip to content

Commit

Permalink
Add/Mod: Options verify_files and zip_path in download script and update error message for invalid checksum during MACS download. (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
Labbeti committed Oct 23, 2023
1 parent 7599e11 commit 6a71578
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 3 deletions.
10 changes: 8 additions & 2 deletions src/aac_datasets/datasets/macs.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,10 @@ def _prepare_macs_dataset(
hash_value = file_info["hash_value"]
valid = validate_file(fpath, hash_value, hash_type="md5")
if not valid:
raise RuntimeError(f"Invalid checksum for file {fname}.")
raise RuntimeError(
f"Invalid checksum for file '{fname}'. (expected md5 checksum '{hash_value}')\n"
f"Please try to remove manually the file '{fpath}' and rerun MACS download."
)
elif verbose >= 2:
pylog.debug(f"File '{fname}' has a valid checksum.")

Expand Down Expand Up @@ -433,7 +436,10 @@ def _prepare_macs_dataset(
hash_value = file_info["hash_value"]
valid = validate_file(zip_fpath, hash_value, hash_type="md5")
if not valid:
raise RuntimeError(f"Invalid checksum for file {zip_fname}.")
raise RuntimeError(
f"Invalid checksum for file '{zip_fname}'. (expected md5 checksum '{hash_value}')\n"
f"Please try to remove manually the file '{zip_fpath}' and rerun MACS download."
)
elif verbose >= 2:
pylog.debug(f"File '{zip_fname}' has a valid checksum.")

Expand Down
20 changes: 19 additions & 1 deletion src/aac_datasets/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
get_default_root,
get_default_ffmpeg_path,
get_default_ytdl_path,
get_default_zip_path,
)


Expand Down Expand Up @@ -87,10 +88,12 @@ def download_macs(
force: bool = False,
download: bool = True,
clean_archives: bool = False,
verify_files: bool = True,
) -> Dict[str, MACS]:
"""Download :class:`~aac_datasets.datasets.macs.MACS` dataset."""
MACS.FORCE_PREPARE_DATA = force
MACS.CLEAN_ARCHIVES = clean_archives
MACS.VERIFY_FILES = verify_files

datasets = {}
for subset in MACSCard.SUBSETS:
Expand All @@ -107,6 +110,7 @@ def download_wavcaps(
subsets: Iterable[str] = WavCapsCard.SUBSETS,
hf_cache_dir: Optional[str] = HUGGINGFACE_HUB_CACHE,
revision: Optional[str] = WavCapsCard.DEFAULT_REVISION,
zip_path: str = ...,
) -> Dict[str, WavCaps]:
"""Download :class:`~aac_datasets.datasets.wavcaps.WavCaps` dataset."""

Expand All @@ -121,6 +125,7 @@ def download_wavcaps(
hf_cache_dir=hf_cache_dir,
revision=revision,
verbose=verbose,
zip_path=zip_path,
)
return datasets

Expand Down Expand Up @@ -166,7 +171,7 @@ def _get_main_download_args() -> Namespace:
"--ytdl_path",
type=str,
default=get_default_ytdl_path(),
help="Path to youtube-dl used to extract metadata from a youtube video.",
help="Path to yt-dl program used to extract metadata from a youtube video.",
)
audiocaps_subparser.add_argument(
"--with_tags",
Expand Down Expand Up @@ -213,6 +218,12 @@ def _get_main_download_args() -> Namespace:
default=False,
help="Remove archives files after extraction.",
)
macs_subparser.add_argument(
"--verify_files",
type=_str_to_bool,
default=True,
help="Verify if downloaded files have a valid checksum.",
)
# Note: MACS only has 1 subset, so we do not add a MACS subsets arg

wavcaps_subparser = subparsers.add_parser(WavCapsCard.NAME)
Expand Down Expand Up @@ -242,6 +253,12 @@ def _get_main_download_args() -> Namespace:
default=WavCapsCard.DEFAULT_REVISION,
help="Revision of the WavCaps dataset.",
)
wavcaps_subparser.add_argument(
"--zip_path",
type=str,
default=get_default_zip_path(),
help="Path to zip executable to combine and extract WavCaps archives.",
)

args = parser.parse_args()
return args
Expand Down Expand Up @@ -296,6 +313,7 @@ def _main_download() -> None:
subsets=args.subsets,
hf_cache_dir=args.hf_cache_dir,
revision=args.revision,
zip_path=args.zip_path,
)

else:
Expand Down

0 comments on commit 6a71578

Please sign in to comment.