-
Notifications
You must be signed in to change notification settings - Fork 40
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DAR-2707][External] Allow repeated polling of pending export releases #876
base: master
Are you sure you want to change the base?
Changes from 9 commits
2874d97
d6dd5ab
483f75c
69890a2
98b603c
aca0280
2dea7c0
2c623d0
39090fa
007a656
d600996
e5bc4b5
2854a55
241c527
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
from typing import Any, Dict, Optional | ||
|
||
import requests | ||
|
||
from darwin.dataset.identifier import DatasetIdentifier | ||
|
||
|
||
|
@@ -22,6 +23,8 @@ class Release: | |
The version of the ``Release``. | ||
name : str | ||
The name of the ``Release``. | ||
status : str | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unsure if it's common in darwin-py but this would be better as an |
||
The status of the ``Release``. | ||
url : Optional[str] | ||
The full url used to download the ``Release``. | ||
export_date : datetime.datetime | ||
|
@@ -47,6 +50,8 @@ class Release: | |
The version of the ``Release``. | ||
name : str | ||
The name of the ``Release``. | ||
status : str | ||
The status of the ``Release``. | ||
url : Optional[str] | ||
The full url used to download the ``Release``. | ||
export_date : datetime.datetime | ||
|
@@ -69,6 +74,7 @@ def __init__( | |
team_slug: str, | ||
version: str, | ||
name: str, | ||
status: str, | ||
url: Optional[str], | ||
export_date: datetime.datetime, | ||
image_count: Optional[int], | ||
|
@@ -81,6 +87,7 @@ def __init__( | |
self.team_slug = team_slug | ||
self.version = version | ||
self.name = name | ||
self.status = status | ||
self.url = url | ||
self.export_date = export_date | ||
self.image_count = image_count | ||
|
@@ -155,6 +162,7 @@ def parse_json( | |
team_slug=team_slug, | ||
version=payload["version"], | ||
name=payload["name"], | ||
status=payload["status"], | ||
export_date=export_date, | ||
url=None, | ||
available=False, | ||
|
@@ -169,6 +177,7 @@ def parse_json( | |
team_slug=team_slug, | ||
version=payload["version"], | ||
name=payload["name"], | ||
status=payload["status"], | ||
image_count=payload["metadata"]["num_images"], | ||
class_count=len(payload["metadata"]["annotation_classes"]), | ||
export_date=export_date, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
import os | ||
import shutil | ||
import tempfile | ||
import time | ||
import zipfile | ||
from datetime import datetime | ||
from pathlib import Path | ||
|
@@ -207,6 +208,7 @@ def pull( | |
video_frames: bool = False, | ||
force_slots: bool = False, | ||
ignore_slots: bool = False, | ||
retry: bool = False, | ||
) -> Tuple[Optional[Callable[[], Iterator[Any]]], int]: | ||
""" | ||
Downloads a remote dataset (images and annotations) to the datasets directory. | ||
|
@@ -237,6 +239,8 @@ def pull( | |
Pulls video frames images instead of video files. | ||
force_slots: bool | ||
Pulls all slots of items into deeper file structure ({prefix}/{item_name}/{slot_name}/{file_name}) | ||
retry: bool | ||
If True, will repeatedly try to download the release if it is still processing up to a maximum of 5 minutes. | ||
|
||
Returns | ||
------- | ||
|
@@ -251,16 +255,46 @@ def pull( | |
If the given ``release`` has an invalid format. | ||
ValueError | ||
If darwin in unable to get ``Team`` configuration. | ||
ValueError | ||
If the release is still processing after the maximum retry duration. | ||
""" | ||
|
||
console = self.console or Console() | ||
|
||
if release is None: | ||
release = self.get_release() | ||
if retry: | ||
raise ValueError( | ||
"To retry downloading a release, a release name must be provided. This can be done as follows:\n\nrelease = dataset.get_release(name='release_name')\ndataset.pull(release=release, retry=True)" | ||
) | ||
else: | ||
release = self.get_release(retry=retry) | ||
|
||
if release.format != "json" and release.format != "darwin_json_2": | ||
raise UnsupportedExportFormat(release.format) | ||
|
||
if release.status == "pending": | ||
if retry: | ||
retry_duration = 300 | ||
retry_interval = 10 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be more conventional to have these configurable via CLI or some SDK settings There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @balysv This makes sense. I can see 2 options, both of which involve building in some validation:
I'm leaning toward the additional arguments |
||
while release.status == "pending" and retry_duration > 0: | ||
console.print( | ||
f"Release '{release.name}' for dataset '{self.name}' is still processing. Retrying in {retry_interval} seconds... {retry_duration} seconds left before timeout." | ||
) | ||
time.sleep(retry_interval) | ||
retry_duration -= retry_interval | ||
release = self.get_release(release.name, retry=retry) | ||
if release.status == "pending": | ||
raise ValueError( | ||
f"Release {release.name} for dataset '{self.name}' is still processing after {retry_interval} seconds. Please try again later." | ||
) | ||
else: | ||
raise ValueError( | ||
f"Release '{release.name}' for dataset '{self.name}' is still processing. Please wait for it to be ready.\n\n If you would like to automatically retry, set the `retry` parameter to `True` with the SDK, or use the `--retry` flag with the CLI." | ||
) | ||
console.print( | ||
f"Release '{release.name}' for dataset '{self.name}' is ready for download. Starting download..." | ||
) | ||
|
||
release_dir = self.local_releases_path / release.name | ||
release_dir.mkdir(parents=True, exist_ok=True) | ||
|
||
|
@@ -715,24 +749,31 @@ def get_report(self, granularity: str = "day") -> str: | |
""" | ||
|
||
@abstractmethod | ||
def get_releases(self) -> List["Release"]: | ||
def get_releases(self, retry: bool = False) -> List["Release"]: | ||
""" | ||
Get a sorted list of releases with the most recent first. | ||
|
||
Parameters | ||
---------- | ||
retry : bool, default: False | ||
If True, return all releases, including those that are not available. | ||
|
||
Returns | ||
------- | ||
List["Release"] | ||
Returns a sorted list of available ``Release``\\s with the most recent first. | ||
""" | ||
|
||
def get_release(self, name: str = "latest") -> "Release": | ||
def get_release(self, name: str = "latest", retry: bool = True) -> "Release": | ||
""" | ||
Get a specific ``Release`` for this ``RemoteDataset``. | ||
|
||
Parameters | ||
---------- | ||
name : str, default: "latest" | ||
Name of the export. | ||
retry : bool, default: True | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the name of the argument |
||
If True, return all releases, including those that are not available. | ||
|
||
Returns | ||
------- | ||
|
@@ -744,9 +785,13 @@ def get_release(self, name: str = "latest") -> "Release": | |
NotFound | ||
The selected ``Release`` does not exist. | ||
""" | ||
releases = self.get_releases() | ||
releases = self.get_releases(retry) | ||
if not releases: | ||
raise NotFound(str(self.identifier)) | ||
raise NotFound( | ||
str( | ||
f"No releases found for dataset '{self.name}'. Please create an export of this dataset first." | ||
) | ||
) | ||
JBWilkie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# overwrite default name with stored dataset.release if supplied | ||
if self.release and name == "latest": | ||
|
@@ -759,7 +804,7 @@ def get_release(self, name: str = "latest") -> "Release": | |
return release | ||
raise NotFound( | ||
str( | ||
f"Release name {name} not found in dataset {self.name}. Please check this release exists for this dataset." | ||
f"Release name '{name}' not found in dataset '{self.name}'. Please check this release exists for this dataset." | ||
) | ||
) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -115,10 +115,15 @@ def __init__( | |
version=2, | ||
) | ||
|
||
def get_releases(self) -> List["Release"]: | ||
def get_releases(self, retry: bool = False) -> List["Release"]: | ||
""" | ||
Get a sorted list of releases with the most recent first. | ||
|
||
Parameters | ||
---------- | ||
retry : bool, default: False | ||
If True, return all releases, including those that are not available. | ||
|
||
Returns | ||
------- | ||
List["Release"] | ||
|
@@ -135,11 +140,19 @@ def get_releases(self) -> List["Release"]: | |
Release.parse_json(self.slug, self.team, payload) | ||
for payload in releases_json | ||
] | ||
return sorted( | ||
filter(lambda x: x.available, releases), | ||
key=lambda x: x.version, | ||
reverse=True, | ||
) | ||
|
||
if retry: | ||
return sorted( | ||
releases, | ||
key=lambda x: x.version, | ||
reverse=True, | ||
) | ||
else: | ||
return sorted( | ||
filter(lambda x: x.available, releases), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm sure you can have this nicer where only the first argument
|
||
key=lambda x: x.version, | ||
reverse=True, | ||
) | ||
|
||
def push( | ||
self, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ def release(dataset_slug: str, team_slug_darwin_json_v2: str) -> Release: | |
team_slug=team_slug_darwin_json_v2, | ||
version="latest", | ||
name="test", | ||
status="test_status", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For documentation purposes, it'd be best to use actual values of export statuses here instead of stubs as they are enums and not arbitrary strings. |
||
url="http://test.v7labs.com/", | ||
export_date="now", | ||
image_count=None, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Few questions here:
latest
hardcoded by us?latest
withretry=True
?latest
. This would ensure we refer to the same export even if a new export would be created in the meantimeThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes,
latest
is a reserved release name. If you try to create an export namedlatest
, the api responds with{"errors":{"name":["is reserved"]}}
We will return the latest available release
Actually yes, I think we can. This is because each release has an
export_date
of typedatetime.datetime
. This allows us to select the most recent release incaseretry
is passed asTrue
. I'll make this change now, thank you for flagging