From 2874d97924063c50a303c8c24bccb21ac9979c58 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Mon, 24 Jun 2024 16:30:15 +0200 Subject: [PATCH 01/13] Made get_releases() return non-available releases --- darwin/dataset/remote_dataset_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/darwin/dataset/remote_dataset_v2.py b/darwin/dataset/remote_dataset_v2.py index 42607a85b..c15d41e24 100644 --- a/darwin/dataset/remote_dataset_v2.py +++ b/darwin/dataset/remote_dataset_v2.py @@ -136,7 +136,7 @@ def get_releases(self) -> List["Release"]: for payload in releases_json ] return sorted( - filter(lambda x: x.available, releases), + releases, key=lambda x: x.version, reverse=True, ) From 483f75cd4115f1ab54ed16d8edb7b76df036a1f0 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Wed, 26 Jun 2024 18:36:15 +0200 Subject: [PATCH 02/13] Added status to the 'Release' class --- darwin/dataset/release.py | 9 +++++++++ darwin/dataset/remote_dataset.py | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/darwin/dataset/release.py b/darwin/dataset/release.py index a8ce9e92d..ee3b8e763 100644 --- a/darwin/dataset/release.py +++ b/darwin/dataset/release.py @@ -4,6 +4,7 @@ from typing import Any, Dict, Optional import requests + from darwin.dataset.identifier import DatasetIdentifier @@ -22,6 +23,8 @@ class Release: The version of the ``Release``. name : str The name of the ``Release``. + status : str + The status of the ``Release``. url : Optional[str] The full url used to download the ``Release``. export_date : datetime.datetime @@ -47,6 +50,8 @@ class Release: The version of the ``Release``. name : str The name of the ``Release``. + status : str + The status of the ``Release``. url : Optional[str] The full url used to download the ``Release``. export_date : datetime.datetime @@ -69,6 +74,7 @@ def __init__( team_slug: str, version: str, name: str, + status: str, url: Optional[str], export_date: datetime.datetime, image_count: Optional[int], @@ -81,6 +87,7 @@ def __init__( self.team_slug = team_slug self.version = version self.name = name + self.status = status self.url = url self.export_date = export_date self.image_count = image_count @@ -155,6 +162,7 @@ def parse_json( team_slug=team_slug, version=payload["version"], name=payload["name"], + status=payload["status"], export_date=export_date, url=None, available=False, @@ -169,6 +177,7 @@ def parse_json( team_slug=team_slug, version=payload["version"], name=payload["name"], + status=payload["status"], image_count=payload["metadata"]["num_images"], class_count=len(payload["metadata"]["annotation_classes"]), export_date=export_date, diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index da517ae43..886b2df5f 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -261,6 +261,13 @@ def pull( if release.format != "json" and release.format != "darwin_json_2": raise UnsupportedExportFormat(release.format) + # Check if the release is ready + if release.status == "processing": + if retry: + pass # Implment retry logic + else: + pass # Raise error letting user know that the release is still processing and that they can use retry if they want to poll + release_dir = self.local_releases_path / release.name release_dir.mkdir(parents=True, exist_ok=True) From 69890a2cafc5124dac22b1e50eebd5453595069d Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Wed, 26 Jun 2024 18:51:02 +0200 Subject: [PATCH 03/13] Implemented retry logic --- darwin/dataset/remote_dataset.py | 28 
++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index 886b2df5f..4c2410aff 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -1,6 +1,7 @@ import os import shutil import tempfile +import time import zipfile from datetime import datetime from pathlib import Path @@ -207,6 +208,7 @@ def pull( video_frames: bool = False, force_slots: bool = False, ignore_slots: bool = False, + retry: bool = False, ) -> Tuple[Optional[Callable[[], Iterator[Any]]], int]: """ Downloads a remote dataset (images and annotations) to the datasets directory. @@ -237,6 +239,8 @@ def pull( Pulls video frames images instead of video files. force_slots: bool Pulls all slots of items into deeper file structure ({prefix}/{item_name}/{slot_name}/{file_name}) + retry: bool + If True, will repeatedly try to download the release if it is still processing up to a maximum of 5 minutes. Returns ------- @@ -251,6 +255,8 @@ def pull( If the given ``release`` has an invalid format. ValueError If darwin in unable to get ``Team`` configuration. + ValueError + If the release is still processing after the maximum retry duration. """ console = self.console or Console() @@ -261,12 +267,26 @@ def pull( if release.format != "json" and release.format != "darwin_json_2": raise UnsupportedExportFormat(release.format) - # Check if the release is ready - if release.status == "processing": + if release.status == "pending": if retry: - pass # Implment retry logic + retry_duration = 300 + retry_interval = 10 + while release.status == "pending" and retry_duration > 0: + console.print( + f"Release {release.name} for dataset {self.name} is still processing. Retrying in 10 seconds... {retry_duration}s left before timeout." + ) + time.sleep(retry_interval) + retry_duration -= retry_interval + release = self.get_release(release.name) + if release.status == "processing": + raise ValueError( + f"Release {release.name} is still processing after multiple retries. Please try again later." + ) else: - pass # Raise error letting user know that the release is still processing and that they can use retry if they want to poll + raise ValueError( + f"Release {release.name} is still processing. Please wait for it to be ready.\n\n If you would like to automatically retry, set the `retry` parameter to `True` with the SDK, or use the `--retry` flag with the CLI." + ) + console.print("Release is ready for download. 
Starting download...") release_dir = self.local_releases_path / release.name release_dir.mkdir(parents=True, exist_ok=True) From 98b603cf4b0e6c909dc4aa3b6c8c7623218c3fbd Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Wed, 26 Jun 2024 18:57:20 +0200 Subject: [PATCH 04/13] Fix tests --- tests/darwin/dataset/release_test.py | 1 + tests/darwin/dataset/remote_dataset_test.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/tests/darwin/dataset/release_test.py b/tests/darwin/dataset/release_test.py index a90fc0dc4..790a84c3f 100644 --- a/tests/darwin/dataset/release_test.py +++ b/tests/darwin/dataset/release_test.py @@ -16,6 +16,7 @@ def release(dataset_slug: str, team_slug_darwin_json_v2: str) -> Release: team_slug=team_slug_darwin_json_v2, version="latest", name="test", + status="test_status", url="http://test.v7labs.com/", export_date="now", image_count=None, diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py index d94b7209d..630d199cb 100644 --- a/tests/darwin/dataset/remote_dataset_test.py +++ b/tests/darwin/dataset/remote_dataset_test.py @@ -662,6 +662,7 @@ def test_gets_latest_release_when_not_given_one( "team-slug", "0.1.0", "release-name", + "release-status", "http://darwin-fake-url.com", datetime.now(), None, @@ -692,6 +693,7 @@ def test_does_not_create_symlink_on_windows( "team-slug", "0.1.0", "release-name", + "release-status", "http://darwin-fake-url.com", datetime.now(), None, @@ -724,6 +726,7 @@ def test_continues_if_symlink_creation_fails( "team-slug", "0.1.0", "release-name", + "release-status", "http://darwin-fake-url.com", datetime.now(), None, @@ -758,6 +761,7 @@ def test_raises_if_release_format_is_not_json( remote_dataset.team, "0.1.0", "release-name", + "release-status", "http://darwin-fake-url.com", datetime.now(), None, @@ -779,6 +783,7 @@ def test_moves_properties_metadata_file( "team-slug", "0.1.0", "release-name", + "release-status", "http://darwin-fake-url.com", datetime.now(), None, From aca028004ae5dcce2f9a46a26d63dba2180f21da Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Fri, 28 Jun 2024 09:29:38 +0100 Subject: [PATCH 05/13] Added --retry CLI flag & improved error messaging --- darwin/cli.py | 1 + darwin/cli_functions.py | 10 +++++++- darwin/dataset/remote_dataset.py | 38 +++++++++++++++++++++-------- darwin/dataset/remote_dataset_v2.py | 25 ++++++++++++++----- darwin/options.py | 5 ++++ 5 files changed, 62 insertions(+), 17 deletions(-) diff --git a/darwin/cli.py b/darwin/cli.py index 003f20802..ce5b549cb 100644 --- a/darwin/cli.py +++ b/darwin/cli.py @@ -155,6 +155,7 @@ def _run(args: Namespace, parser: ArgumentParser) -> None: args.video_frames, args.force_slots, args.ignore_slots, + args.retry, ) elif args.action == "import": f.dataset_import( diff --git a/darwin/cli_functions.py b/darwin/cli_functions.py index efde7456f..0b136ea57 100644 --- a/darwin/cli_functions.py +++ b/darwin/cli_functions.py @@ -410,6 +410,7 @@ def pull_dataset( video_frames: bool = False, force_slots: bool = False, ignore_slots: bool = False, + retry: bool = False, ) -> None: """ Downloads a remote dataset (images and annotations) in the datasets directory. @@ -428,8 +429,14 @@ def pull_dataset( Pulls video frames images instead of video files. Defaults to False. force_slots: bool Pulls all slots of items into deeper file structure ({prefix}/{item_name}/{slot_name}/{file_name}) + retry: bool + If True, will repeatedly try to download the release if it is still processing up to a maximum of 5 minutes. 
""" version: str = DatasetIdentifier.parse(dataset_slug).version or "latest" + if version == "latest" and retry: + raise ValueError( + "To retry downloading a release, a release name must be provided. This can be done as follows:\n\ndarwin dataset pull team-slug/dataset-slug:release-name" + ) client: Client = _load_client(offline=False, maybe_guest=True) try: dataset: RemoteDataset = client.get_remote_dataset( @@ -444,7 +451,7 @@ def pull_dataset( _error("please re-authenticate") try: - release: Release = dataset.get_release(version) + release: Release = dataset.get_release(version, retry) dataset.pull( release=release, only_annotations=only_annotations, @@ -452,6 +459,7 @@ def pull_dataset( video_frames=video_frames, force_slots=force_slots, ignore_slots=ignore_slots, + retry=retry, ) print_new_version_info(client) except NotFound: diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index 4c2410aff..3375ccbf8 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -262,7 +262,12 @@ def pull( console = self.console or Console() if release is None: - release = self.get_release() + if retry: + raise ValueError( + "To retry downloading a release, a release name must be provided. This can be done as follows:\n\nrelease = dataset.get_release(name='release_name')\ndataset.pull(release=release, retry=True)" + ) + else: + release = self.get_release(retry=retry) if release.format != "json" and release.format != "darwin_json_2": raise UnsupportedExportFormat(release.format) @@ -273,20 +278,22 @@ def pull( retry_interval = 10 while release.status == "pending" and retry_duration > 0: console.print( - f"Release {release.name} for dataset {self.name} is still processing. Retrying in 10 seconds... {retry_duration}s left before timeout." + f"Release '{release.name}' for dataset '{self.name}' is still processing. Retrying in {retry_interval} seconds... {retry_duration} seconds left before timeout." ) time.sleep(retry_interval) retry_duration -= retry_interval release = self.get_release(release.name) - if release.status == "processing": + if release.status == "pending": raise ValueError( - f"Release {release.name} is still processing after multiple retries. Please try again later." + f"Release {release.name} is still processing after {retry_interval} seconds. Please try again later." ) else: raise ValueError( f"Release {release.name} is still processing. Please wait for it to be ready.\n\n If you would like to automatically retry, set the `retry` parameter to `True` with the SDK, or use the `--retry` flag with the CLI." ) - console.print("Release is ready for download. Starting download...") + console.print( + f"Release '{release.name}' for dataset '{self.name}' is ready for download. Starting download..." + ) release_dir = self.local_releases_path / release.name release_dir.mkdir(parents=True, exist_ok=True) @@ -742,17 +749,22 @@ def get_report(self, granularity: str = "day") -> str: """ @abstractmethod - def get_releases(self) -> List["Release"]: + def get_releases(self, retry: bool = False) -> List["Release"]: """ Get a sorted list of releases with the most recent first. + Parameters + ---------- + retry : bool, default: False + If True, return all releases, including those that are not available. + Returns ------- List["Release"] Returns a sorted list of available ``Release``\\s with the most recent first. 
""" - def get_release(self, name: str = "latest") -> "Release": + def get_release(self, name: str = "latest", retry: bool = True) -> "Release": """ Get a specific ``Release`` for this ``RemoteDataset``. @@ -760,6 +772,8 @@ def get_release(self, name: str = "latest") -> "Release": ---------- name : str, default: "latest" Name of the export. + retry : bool, default: False + If True, return all releases, including those that are not available. Returns ------- @@ -771,9 +785,13 @@ def get_release(self, name: str = "latest") -> "Release": NotFound The selected ``Release`` does not exist. """ - releases = self.get_releases() + releases = self.get_releases(retry) if not releases: - raise NotFound(str(self.identifier)) + raise NotFound( + str( + f"No releases found for dataset '{self.name}'. Please create an export of this dataset first." + ) + ) # overwrite default name with stored dataset.release if supplied if self.release and name == "latest": @@ -786,7 +804,7 @@ def get_release(self, name: str = "latest") -> "Release": return release raise NotFound( str( - f"Release name {name} not found in dataset {self.name}. Please check this release exists for this dataset." + f"Release name '{name}' not found in dataset '{self.name}'. Please check this release exists for this dataset." ) ) diff --git a/darwin/dataset/remote_dataset_v2.py b/darwin/dataset/remote_dataset_v2.py index c15d41e24..abf101690 100644 --- a/darwin/dataset/remote_dataset_v2.py +++ b/darwin/dataset/remote_dataset_v2.py @@ -115,10 +115,15 @@ def __init__( version=2, ) - def get_releases(self) -> List["Release"]: + def get_releases(self, retry: bool = False) -> List["Release"]: """ Get a sorted list of releases with the most recent first. + Parameters + ---------- + retry : bool, default: False + If True, return all releases, including those that are not available. + Returns ------- List["Release"] @@ -135,11 +140,19 @@ def get_releases(self) -> List["Release"]: Release.parse_json(self.slug, self.team, payload) for payload in releases_json ] - return sorted( - releases, - key=lambda x: x.version, - reverse=True, - ) + + if retry: + return sorted( + releases, + key=lambda x: x.version, + reverse=True, + ) + else: + return sorted( + filter(lambda x: x.available, releases), + key=lambda x: x.version, + reverse=True, + ) def push( self, diff --git a/darwin/options.py b/darwin/options.py index db837bb7f..62aa519fd 100644 --- a/darwin/options.py +++ b/darwin/options.py @@ -277,6 +277,11 @@ def __init__(self) -> None: action="store_true", help="Pulls video frame images instead of video files.", ) + parser_pull.add_argument( + "--retry", + action="store_true", + help="Repeatedly try to download the release if it is still processing. Times out after 5 minutes.", + ) slots_group = parser_pull.add_mutually_exclusive_group() slots_group.add_argument( "--force-slots", From 2dea7c0cb502940a8cb73240930404ae4e86a8de Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Fri, 28 Jun 2024 10:42:13 +0100 Subject: [PATCH 06/13] Improved error messaging --- darwin/dataset/remote_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index 3375ccbf8..f956c59f2 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -285,11 +285,11 @@ def pull( release = self.get_release(release.name) if release.status == "pending": raise ValueError( - f"Release {release.name} is still processing after {retry_interval} seconds. Please try again later." 
+ f"Release {release.name} for dataset '{self.name}' is still processing after {retry_interval} seconds. Please try again later." ) else: raise ValueError( - f"Release {release.name} is still processing. Please wait for it to be ready.\n\n If you would like to automatically retry, set the `retry` parameter to `True` with the SDK, or use the `--retry` flag with the CLI." + f"Release '{release.name}' for dataset '{self.name}' is still processing. Please wait for it to be ready.\n\n If you would like to automatically retry, set the `retry` parameter to `True` with the SDK, or use the `--retry` flag with the CLI." ) console.print( f"Release '{release.name}' for dataset '{self.name}' is ready for download. Starting download..." From 2c623d0fab3402a2fc6f24f9286a34dbc4a51f2b Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Fri, 28 Jun 2024 15:11:06 +0100 Subject: [PATCH 07/13] Unit tests --- tests/darwin/dataset/remote_dataset_test.py | 43 +++++++++++++++++ tests/fixtures.py | 51 ++++++++++++++++++++- 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py index 630d199cb..a4d88adc9 100644 --- a/tests/darwin/dataset/remote_dataset_test.py +++ b/tests/darwin/dataset/remote_dataset_test.py @@ -813,6 +813,27 @@ def fake_download_zip(self, path): ) assert metadata_path.exists() + def test_pull_raises_value_error_when_retry_is_true_and_release_is_none( + self, remote_dataset + ): + with pytest.raises(ValueError): + remote_dataset.pull(release=None, retry=True) + + @patch("time.sleep", return_value=None) + def test_num_retries(self, mock_sleep, remote_dataset, pending_release): + with patch.object(remote_dataset, "get_release", return_value=pending_release): + with pytest.raises(ValueError): + remote_dataset.pull(release=pending_release, retry=True) + assert mock_sleep.call_count == 30 # 300 seconds / 10 seconds interval + + @patch("time.sleep", return_value=None) + def test_raises_after_max_retry_duration( + self, mock_sleep, remote_dataset, pending_release + ): + with patch.object(remote_dataset, "get_release", return_value=pending_release): + with pytest.raises(ValueError, match="is still processing after"): + remote_dataset.pull(release=pending_release, retry=True) + class TestPullNamingConvention: def _test_pull_naming_convention( @@ -1321,3 +1342,25 @@ def test_register_files_with_blocked_items(self, remote_dataset: RemoteDatasetV2 ) assert len(result["registered"]) == 0 assert len(result["blocked"]) == 1 + + +@pytest.mark.usefixtures("file_read_write_test") +class TestGetReleases: + @patch("darwin.backend_v2.BackendV2.get_exports") + def test_returns_unavailable_releases_when_retry_is_true( + self, mock_get_exports, remote_dataset, releases_api_response + ): + mock_get_exports.return_value = releases_api_response + releases = remote_dataset.get_releases(retry=True) + assert len(releases) == 2 + assert isinstance(releases[0], Release) + assert isinstance(releases[1], Release) + + @patch("darwin.backend_v2.BackendV2.get_exports") + def test_omits_unavailable_releases_when_retry_is_false( + self, mock_get_exports, remote_dataset, releases_api_response + ): + mock_get_exports.return_value = releases_api_response + releases = remote_dataset.get_releases(retry=False) + assert len(releases) == 1 + assert isinstance(releases[0], Release) diff --git a/tests/fixtures.py b/tests/fixtures.py index 37059da55..87ee2029a 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -1,11 +1,13 @@ import shutil +from 
datetime import datetime from pathlib import Path -from typing import Generator +from typing import Generator, List from zipfile import ZipFile import pytest from darwin.config import Config +from darwin.dataset.release import Release @pytest.fixture @@ -118,3 +120,50 @@ def local_config_file( shutil.rmtree(darwin_path) if backup_darwin_path.exists(): shutil.move(backup_darwin_path, darwin_path) + + +@pytest.fixture +def pending_release() -> Release: + return Release( + dataset_slug="dataset_slug", + team_slug="team_slug", + version="1", + name="name", + status="pending", + url=None, + export_date=datetime.now(), + image_count=1, + class_count=1, + available=False, + latest=False, + format="json", + ) + + +@pytest.fixture +def releases_api_response() -> List[dict]: + return [ + { + "name": "release_1", + "status": "complete", + "version": 1, + "format": "darwin_json_2", + "metadata": { + "num_images": 1, + "annotation_classes": [{"id": 1, "name": "test_class"}], + }, + "inserted_at": "2024-06-28T12:29:02Z", + "latest": True, + "download_url": "download_url", + }, + { + "name": "release_2", + "status": "pending", + "version": 2, + "format": "darwin_json_2", + "metadata": {}, + "inserted_at": "2024-06-28T12:32:33Z", + "latest": False, + "download_url": None, + }, + ] From 39090fa1b9bae8e75c8b56b544ac1409d35b26c9 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Fri, 28 Jun 2024 15:54:53 +0100 Subject: [PATCH 08/13] Corrected docstrong --- darwin/dataset/remote_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index f956c59f2..2f6868c0c 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -282,7 +282,7 @@ def pull( ) time.sleep(retry_interval) retry_duration -= retry_interval - release = self.get_release(release.name) + release = self.get_release(release.name, retry=retry) if release.status == "pending": raise ValueError( f"Release {release.name} for dataset '{self.name}' is still processing after {retry_interval} seconds. Please try again later." @@ -772,7 +772,7 @@ def get_release(self, name: str = "latest", retry: bool = True) -> "Release": ---------- name : str, default: "latest" Name of the export. - retry : bool, default: False + retry : bool, default: True If True, return all releases, including those that are not available. 
Returns From 007a6564a4236ae0e4f8c70571304f36f1835823 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Thu, 4 Jul 2024 17:41:24 +0100 Subject: [PATCH 09/13] Feedback --- darwin/cli.py | 2 + darwin/cli_functions.py | 14 ++++--- darwin/dataset/release.py | 13 ++++-- darwin/dataset/remote_dataset.py | 44 ++++++++++++--------- darwin/dataset/remote_dataset_v2.py | 25 ++++++------ darwin/options.py | 15 ++++++- tests/darwin/dataset/release_test.py | 7 ++-- tests/darwin/dataset/remote_dataset_test.py | 18 ++++----- 8 files changed, 84 insertions(+), 54 deletions(-) diff --git a/darwin/cli.py b/darwin/cli.py index ce5b549cb..50b628c02 100644 --- a/darwin/cli.py +++ b/darwin/cli.py @@ -156,6 +156,8 @@ def _run(args: Namespace, parser: ArgumentParser) -> None: args.force_slots, args.ignore_slots, args.retry, + args.retry_timeout, + args.retry_interval, ) elif args.action == "import": f.dataset_import( diff --git a/darwin/cli_functions.py b/darwin/cli_functions.py index 0b136ea57..199d6f4fe 100644 --- a/darwin/cli_functions.py +++ b/darwin/cli_functions.py @@ -411,6 +411,8 @@ def pull_dataset( force_slots: bool = False, ignore_slots: bool = False, retry: bool = False, + retry_timeout: int = 600, + retry_interval: int = 10, ) -> None: """ Downloads a remote dataset (images and annotations) in the datasets directory. @@ -430,13 +432,13 @@ def pull_dataset( force_slots: bool Pulls all slots of items into deeper file structure ({prefix}/{item_name}/{slot_name}/{file_name}) retry: bool - If True, will repeatedly try to download the release if it is still processing up to a maximum of 5 minutes. + If True, will repeatedly try to download the release if it is still processing until the timeout is reached. + retry_timeout: int + If retrying, total time to wait for the release to be ready for download + retry_interval: int + If retrying, time to wait between retries of checking if the release is ready for download. """ version: str = DatasetIdentifier.parse(dataset_slug).version or "latest" - if version == "latest" and retry: - raise ValueError( - "To retry downloading a release, a release name must be provided. This can be done as follows:\n\ndarwin dataset pull team-slug/dataset-slug:release-name" - ) client: Client = _load_client(offline=False, maybe_guest=True) try: dataset: RemoteDataset = client.get_remote_dataset( @@ -460,6 +462,8 @@ def pull_dataset( force_slots=force_slots, ignore_slots=ignore_slots, retry=retry, + retry_timeout=retry_timeout, + retry_interval=retry_interval, ) print_new_version_info(client) except NotFound: diff --git a/darwin/dataset/release.py b/darwin/dataset/release.py index ee3b8e763..364b80837 100644 --- a/darwin/dataset/release.py +++ b/darwin/dataset/release.py @@ -1,5 +1,6 @@ import datetime import shutil +from enum import Enum from pathlib import Path from typing import Any, Dict, Optional @@ -8,6 +9,12 @@ from darwin.dataset.identifier import DatasetIdentifier +class ReleaseStatus(Enum): + PENDING = "pending" + COMPLETE = "complete" + FAILED = "failed" + + class Release: """ Represents a release/export. Releases created this way can only contain items with 'completed' @@ -23,7 +30,7 @@ class Release: The version of the ``Release``. name : str The name of the ``Release``. - status : str + status : ReleaseStatus The status of the ``Release``. url : Optional[str] The full url used to download the ``Release``. @@ -50,7 +57,7 @@ class Release: The version of the ``Release``. name : str The name of the ``Release``. 
- status : str + status : ReleaseStatus The status of the ``Release``. url : Optional[str] The full url used to download the ``Release``. @@ -74,7 +81,7 @@ def __init__( team_slug: str, version: str, name: str, - status: str, + status: ReleaseStatus, url: Optional[str], export_date: datetime.datetime, image_count: Optional[int], diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index 2f6868c0c..07bfe5879 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -209,6 +209,8 @@ def pull( force_slots: bool = False, ignore_slots: bool = False, retry: bool = False, + retry_timeout: int = 600, + retry_interval: int = 10, ) -> Tuple[Optional[Callable[[], Iterator[Any]]], int]: """ Downloads a remote dataset (images and annotations) to the datasets directory. @@ -261,31 +263,29 @@ def pull( console = self.console or Console() + if retry and retry_timeout < retry_interval: + raise ValueError( + f"The value of retry_timeout '{retry_timeout}' must be greater than or equal to the value of retry_interval '{retry_interval}'." + ) + if release is None: - if retry: - raise ValueError( - "To retry downloading a release, a release name must be provided. This can be done as follows:\n\nrelease = dataset.get_release(name='release_name')\ndataset.pull(release=release, retry=True)" - ) - else: - release = self.get_release(retry=retry) + release = self.get_release(include_unavailable=retry) if release.format != "json" and release.format != "darwin_json_2": raise UnsupportedExportFormat(release.format) if release.status == "pending": if retry: - retry_duration = 300 - retry_interval = 10 - while release.status == "pending" and retry_duration > 0: + while release.status == "pending" and retry_timeout > 0: console.print( - f"Release '{release.name}' for dataset '{self.name}' is still processing. Retrying in {retry_interval} seconds... {retry_duration} seconds left before timeout." + f"Release '{release.name}' for dataset '{self.name}' is still processing. Retrying in {retry_interval} seconds... {retry_timeout} seconds left before timeout." ) time.sleep(retry_interval) - retry_duration -= retry_interval - release = self.get_release(release.name, retry=retry) + retry_timeout -= retry_interval + release = self.get_release(release.name, include_unavailable=retry) if release.status == "pending": raise ValueError( - f"Release {release.name} for dataset '{self.name}' is still processing after {retry_interval} seconds. Please try again later." + f"Release {release.name} for dataset '{self.name}' is still processing. Please try again later." ) else: raise ValueError( @@ -749,13 +749,13 @@ def get_report(self, granularity: str = "day") -> str: """ @abstractmethod - def get_releases(self, retry: bool = False) -> List["Release"]: + def get_releases(self, include_unavailable: bool = False) -> List["Release"]: """ Get a sorted list of releases with the most recent first. Parameters ---------- - retry : bool, default: False + include_unavailable : bool, default: False If True, return all releases, including those that are not available. Returns @@ -764,7 +764,9 @@ def get_releases(self, retry: bool = False) -> List["Release"]: Returns a sorted list of available ``Release``\\s with the most recent first. """ - def get_release(self, name: str = "latest", retry: bool = True) -> "Release": + def get_release( + self, name: str = "latest", include_unavailable: bool = True + ) -> "Release": """ Get a specific ``Release`` for this ``RemoteDataset``. 
@@ -772,7 +774,7 @@ def get_release(self, name: str = "latest", retry: bool = True) -> "Release": ---------- name : str, default: "latest" Name of the export. - retry : bool, default: True + include_unavailable : bool, default: True If True, return all releases, including those that are not available. Returns @@ -785,7 +787,7 @@ def get_release(self, name: str = "latest", retry: bool = True) -> "Release": NotFound The selected ``Release`` does not exist. """ - releases = self.get_releases(retry) + releases = self.get_releases(include_unavailable=include_unavailable) if not releases: raise NotFound( str( @@ -797,7 +799,11 @@ def get_release(self, name: str = "latest", retry: bool = True) -> "Release": if self.release and name == "latest": name = self.release elif name == "latest": - return next((release for release in releases if release.latest)) + return ( + sorted(releases, key=lambda x: x.export_date, reverse=True)[0] + if include_unavailable + else next((release for release in releases if release.latest)) + ) for release in releases: if str(release.name) == name: diff --git a/darwin/dataset/remote_dataset_v2.py b/darwin/dataset/remote_dataset_v2.py index abf101690..a8764e31b 100644 --- a/darwin/dataset/remote_dataset_v2.py +++ b/darwin/dataset/remote_dataset_v2.py @@ -115,13 +115,13 @@ def __init__( version=2, ) - def get_releases(self, retry: bool = False) -> List["Release"]: + def get_releases(self, include_unavailable: bool = False) -> List["Release"]: """ Get a sorted list of releases with the most recent first. Parameters ---------- - retry : bool, default: False + include_unavailable : bool, default: False If True, return all releases, including those that are not available. Returns @@ -141,18 +141,15 @@ def get_releases(self, retry: bool = False) -> List["Release"]: for payload in releases_json ] - if retry: - return sorted( - releases, - key=lambda x: x.version, - reverse=True, - ) - else: - return sorted( - filter(lambda x: x.available, releases), - key=lambda x: x.version, - reverse=True, - ) + return sorted( + ( + releases + if include_unavailable + else filter(lambda x: x.available, releases) + ), + key=lambda x: x.version, + reverse=True, + ) def push( self, diff --git a/darwin/options.py b/darwin/options.py index 62aa519fd..4efa8599c 100644 --- a/darwin/options.py +++ b/darwin/options.py @@ -280,7 +280,20 @@ def __init__(self) -> None: parser_pull.add_argument( "--retry", action="store_true", - help="Repeatedly try to download the release if it is still processing. 
Times out after 5 minutes.", + default=False, + help="Repeatedly try to download the release if it is still processing.", + ) + parser_pull.add_argument( + "--retry-timeout", + type=int, + default=600, + help="Total time to wait for the release to be ready for download.", + ) + parser_pull.add_argument( + "--retry-interval", + type=int, + default=10, + help="Time to wait between retries of checking if the release is ready for download.", ) slots_group = parser_pull.add_mutually_exclusive_group() slots_group.add_argument( diff --git a/tests/darwin/dataset/release_test.py b/tests/darwin/dataset/release_test.py index 790a84c3f..2ee86a84c 100644 --- a/tests/darwin/dataset/release_test.py +++ b/tests/darwin/dataset/release_test.py @@ -1,11 +1,12 @@ import shutil +from datetime import datetime from pathlib import Path from unittest.mock import patch import pytest import requests -from darwin.dataset.release import Release +from darwin.dataset.release import Release, ReleaseStatus from tests.fixtures import * @@ -16,9 +17,9 @@ def release(dataset_slug: str, team_slug_darwin_json_v2: str) -> Release: team_slug=team_slug_darwin_json_v2, version="latest", name="test", - status="test_status", + status=ReleaseStatus("pending"), url="http://test.v7labs.com/", - export_date="now", + export_date=datetime.fromisoformat("2021-01-01T00:00:00+00:00"), image_count=None, class_count=None, available=True, diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py index a4d88adc9..05dcaf49f 100644 --- a/tests/darwin/dataset/remote_dataset_test.py +++ b/tests/darwin/dataset/remote_dataset_test.py @@ -813,18 +813,14 @@ def fake_download_zip(self, path): ) assert metadata_path.exists() - def test_pull_raises_value_error_when_retry_is_true_and_release_is_none( - self, remote_dataset - ): - with pytest.raises(ValueError): - remote_dataset.pull(release=None, retry=True) - @patch("time.sleep", return_value=None) def test_num_retries(self, mock_sleep, remote_dataset, pending_release): with patch.object(remote_dataset, "get_release", return_value=pending_release): with pytest.raises(ValueError): remote_dataset.pull(release=pending_release, retry=True) - assert mock_sleep.call_count == 30 # 300 seconds / 10 seconds interval + assert ( + mock_sleep.call_count == 60 + ) # Default values of 600 seconds / 10 seconds interval @patch("time.sleep", return_value=None) def test_raises_after_max_retry_duration( @@ -834,6 +830,10 @@ def test_raises_after_max_retry_duration( with pytest.raises(ValueError, match="is still processing after"): remote_dataset.pull(release=pending_release, retry=True) + def test_raises_error_if_timeout_less_than_interval(self, remote_dataset): + with pytest.raises(ValueError): + remote_dataset.pull(retry=True, retry_timeout=5, retry_interval=10) + class TestPullNamingConvention: def _test_pull_naming_convention( @@ -1351,7 +1351,7 @@ def test_returns_unavailable_releases_when_retry_is_true( self, mock_get_exports, remote_dataset, releases_api_response ): mock_get_exports.return_value = releases_api_response - releases = remote_dataset.get_releases(retry=True) + releases = remote_dataset.get_releases(include_unavailable=True) assert len(releases) == 2 assert isinstance(releases[0], Release) assert isinstance(releases[1], Release) @@ -1361,6 +1361,6 @@ def test_omits_unavailable_releases_when_retry_is_false( self, mock_get_exports, remote_dataset, releases_api_response ): mock_get_exports.return_value = releases_api_response - releases = 
remote_dataset.get_releases(retry=False) + releases = remote_dataset.get_releases(include_unavailable=False) assert len(releases) == 1 assert isinstance(releases[0], Release) From d600996f36f9b22fb6971fe03af9ff7ee5b8f7ca Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Thu, 4 Jul 2024 17:52:33 +0100 Subject: [PATCH 10/13] Fixed test --- tests/darwin/dataset/remote_dataset_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py index 05dcaf49f..ac76baa98 100644 --- a/tests/darwin/dataset/remote_dataset_test.py +++ b/tests/darwin/dataset/remote_dataset_test.py @@ -827,7 +827,7 @@ def test_raises_after_max_retry_duration( self, mock_sleep, remote_dataset, pending_release ): with patch.object(remote_dataset, "get_release", return_value=pending_release): - with pytest.raises(ValueError, match="is still processing after"): + with pytest.raises(ValueError, match="is still processing"): remote_dataset.pull(release=pending_release, retry=True) def test_raises_error_if_timeout_less_than_interval(self, remote_dataset): From e5bc4b5de56a570438d839b73e5b03a381cd7859 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Thu, 4 Jul 2024 17:52:33 +0100 Subject: [PATCH 11/13] Fixed test --- tests/fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 87ee2029a..8cf866e81 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,7 +7,7 @@ import pytest from darwin.config import Config -from darwin.dataset.release import Release +from darwin.dataset.release import Release, ReleaseStatus @pytest.fixture @@ -129,7 +129,7 @@ def pending_release() -> Release: team_slug="team_slug", version="1", name="name", - status="pending", + status=ReleaseStatus("pending"), url=None, export_date=datetime.now(), image_count=1, From 2854a5538877a512bfeb3bbd9a7610f2fd6f8ad7 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Thu, 4 Jul 2024 17:52:33 +0100 Subject: [PATCH 12/13] Fixed tests --- darwin/dataset/release.py | 2 +- tests/darwin/dataset/remote_dataset_test.py | 12 ++++++------ tests/fixtures.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/darwin/dataset/release.py b/darwin/dataset/release.py index 364b80837..517ca91ef 100644 --- a/darwin/dataset/release.py +++ b/darwin/dataset/release.py @@ -94,7 +94,7 @@ def __init__( self.team_slug = team_slug self.version = version self.name = name - self.status = status + self.status = status.value self.url = url self.export_date = export_date self.image_count = image_count diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py index ac76baa98..5073a0c8c 100644 --- a/tests/darwin/dataset/remote_dataset_test.py +++ b/tests/darwin/dataset/remote_dataset_test.py @@ -16,7 +16,7 @@ from darwin.config import Config from darwin.dataset import RemoteDataset from darwin.dataset.download_manager import _download_image_from_json_annotation -from darwin.dataset.release import Release +from darwin.dataset.release import Release, ReleaseStatus from darwin.dataset.remote_dataset_v2 import RemoteDatasetV2 from darwin.dataset.upload_manager import LocalFile, UploadHandlerV2 from darwin.datatypes import ManifestItem, ObjectStore, SegmentManifest @@ -662,7 +662,7 @@ def test_gets_latest_release_when_not_given_one( "team-slug", "0.1.0", "release-name", - "release-status", + ReleaseStatus("complete"), "http://darwin-fake-url.com", datetime.now(), None, @@ 
-693,7 +693,7 @@ def test_does_not_create_symlink_on_windows( "team-slug", "0.1.0", "release-name", - "release-status", + ReleaseStatus("complete"), "http://darwin-fake-url.com", datetime.now(), None, @@ -726,7 +726,7 @@ def test_continues_if_symlink_creation_fails( "team-slug", "0.1.0", "release-name", - "release-status", + ReleaseStatus("complete"), "http://darwin-fake-url.com", datetime.now(), None, @@ -761,7 +761,7 @@ def test_raises_if_release_format_is_not_json( remote_dataset.team, "0.1.0", "release-name", - "release-status", + ReleaseStatus("complete"), "http://darwin-fake-url.com", datetime.now(), None, @@ -783,7 +783,7 @@ def test_moves_properties_metadata_file( "team-slug", "0.1.0", "release-name", - "release-status", + ReleaseStatus("complete"), "http://darwin-fake-url.com", datetime.now(), None, diff --git a/tests/fixtures.py b/tests/fixtures.py index 8cf866e81..045d98888 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -145,7 +145,7 @@ def releases_api_response() -> List[dict]: return [ { "name": "release_1", - "status": "complete", + "status": ReleaseStatus("complete"), "version": 1, "format": "darwin_json_2", "metadata": { @@ -158,7 +158,7 @@ def releases_api_response() -> List[dict]: }, { "name": "release_2", - "status": "pending", + "status": ReleaseStatus("pending"), "version": 2, "format": "darwin_json_2", "metadata": {}, From 241c527fb3ce57c5dc5821efd36996d24946558a Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Tue, 16 Jul 2024 14:47:19 +0100 Subject: [PATCH 13/13] Updated retry logic so that it worked with storing release statuses as enums --- darwin/cli_functions.py | 2 +- darwin/dataset/release.py | 2 +- darwin/dataset/remote_dataset.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/darwin/cli_functions.py b/darwin/cli_functions.py index 199d6f4fe..395c7c829 100644 --- a/darwin/cli_functions.py +++ b/darwin/cli_functions.py @@ -468,7 +468,7 @@ def pull_dataset( print_new_version_info(client) except NotFound: _error( - f"Version '{dataset.identifier}:{version}' does not exist " + f"Version '{dataset.identifier}:{version}' does not exist. " f"Use 'darwin dataset releases' to list all available versions." ) except UnsupportedExportFormat as uef: diff --git a/darwin/dataset/release.py b/darwin/dataset/release.py index 517ca91ef..54e32fbb7 100644 --- a/darwin/dataset/release.py +++ b/darwin/dataset/release.py @@ -94,7 +94,7 @@ def __init__( self.team_slug = team_slug self.version = version self.name = name - self.status = status.value + self.status = ReleaseStatus(status) self.url = url self.export_date = export_date self.image_count = image_count diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index 07bfe5879..c96ef194b 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -274,16 +274,16 @@ def pull( if release.format != "json" and release.format != "darwin_json_2": raise UnsupportedExportFormat(release.format) - if release.status == "pending": + if release.status.value == "pending": if retry: - while release.status == "pending" and retry_timeout > 0: + while release.status.value == "pending" and retry_timeout > 0: console.print( f"Release '{release.name}' for dataset '{self.name}' is still processing. Retrying in {retry_interval} seconds... {retry_timeout} seconds left before timeout." 
) time.sleep(retry_interval) retry_timeout -= retry_interval release = self.get_release(release.name, include_unavailable=retry) - if release.status == "pending": + if release.status.value == "pending": raise ValueError( f"Release {release.name} for dataset '{self.name}' is still processing. Please try again later." )
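The series above adds release-status awareness and retry/polling behaviour to dataset pulls, both in the SDK (the `retry`, `retry_timeout`, and `retry_interval` parameters on `RemoteDataset.pull()`) and in the CLI (the `--retry`, `--retry-timeout`, and `--retry-interval` flags added in darwin/options.py). A minimal sketch of how a caller might exercise the new behaviour once this series is applied — the `Client.local()` construction and the team/dataset/release slugs are illustrative assumptions, not taken from the diff:

    from darwin.client import Client

    # Assumes credentials are already configured locally (e.g. via `darwin authenticate`).
    client = Client.local()
    dataset = client.get_remote_dataset(dataset_identifier="team-slug/dataset-slug")

    # After this series, get_release() also returns releases whose export is still
    # processing (include_unavailable defaults to True).
    release = dataset.get_release(name="release-name")

    # Poll the release until it is ready, waiting up to 600 seconds in 10-second
    # intervals, then download it; raises ValueError if it is still pending at timeout.
    dataset.pull(
        release=release,
        retry=True,
        retry_timeout=600,
        retry_interval=10,
    )

The CLI equivalent, using the new flags, would be along the lines of:

    darwin dataset pull team-slug/dataset-slug:release-name --retry --retry-timeout 600 --retry-interval 10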