Skip to content

Commit

Permalink
Merge pull request #164 from EGA-archive/EE-1405
Browse files Browse the repository at this point in the history
EE-1405 Python client support for new Data API
  • Loading branch information
afoix authored Aug 16, 2022
2 parents 93df9df + 076c78c commit 35c7093
Show file tree
Hide file tree
Showing 8 changed files with 76 additions and 45 deletions.
3 changes: 2 additions & 1 deletion pyega3/libs/data_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,14 @@ def create_session_with_retry(retry_policy: retry.Retry = None, pool_max_size=No

class DataClient:

def __init__(self, data_url, htsget_url, auth_client, standard_headers, connections=None, metadata_url=None):
def __init__(self, data_url, htsget_url, auth_client, standard_headers, connections=None, metadata_url=None, api_version=1):
self.url = data_url
self.metadata_url = metadata_url if metadata_url is not None else data_url + "/metadata"
self.htsget_url = htsget_url
self.auth_client = auth_client
self.standard_headers = standard_headers
self.session = create_session_with_retry(pool_max_size=connections)
self.api_version = api_version

@staticmethod
def print_debug_info(url, reply_json, *args):
Expand Down
34 changes: 27 additions & 7 deletions pyega3/libs/data_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,22 +38,38 @@ def __init__(self, data_client, file_id,
self._unencrypted_checksum = unencrypted_checksum
self._file_status = status

@staticmethod
def from_metadata(data_client, metadata):
    """Build a DataFile from a single metadata record.

    The record must contain a 'fileId' entry (a KeyError is raised otherwise);
    the whole record is then passed to _set_metadata_from_json to fill in the
    remaining fields.
    """
    data_file = DataFile(data_client, metadata['fileId'])
    data_file._set_metadata_from_json(metadata)
    return data_file

def load_metadata(self):
res = self.data_client.get_json(f"/files/{self.id}")

# If the user does not have access to the file then the server returns HTTP code 200 but the JSON payload has
# all the fields empty
if res['displayFileName'] is None or res['unencryptedChecksum'] is None:
if self.data_client.api_version < 2 and (res['displayFileName'] is None or res['unencryptedChecksum'] is None):
raise RuntimeError(f"Metadata for file id '{self.id}' could not be retrieved. " +
"This is probably because your account does not have access to this file. "
"You can check which datasets your account has access to at "
"'https://ega-archive.org/my-datasets.php' after logging in.")

self._display_file_name = res['displayFileName']
self._file_name = res['fileName']
self._file_size = res['fileSize']
self._unencrypted_checksum = res['unencryptedChecksum']
self._file_status = res['fileStatus']
self._set_metadata_from_json(res)

def _set_metadata_from_json(self, res):
self._display_file_name = res['displayFileName'] if 'displayFileName' in res else None
self._file_name = res['fileName'] if 'fileName' in res else None
self._file_size = res['fileSize'] if 'fileSize' in res else None

if self.data_client.api_version == 1:
self._unencrypted_checksum = res['unencryptedChecksum'] if 'unencryptedChecksum' in res else None
self._file_status = res['fileStatus'] if 'fileStatus' in res else None

elif self.data_client.api_version == 2:
self._unencrypted_checksum = res['plainChecksum'] if 'plainChecksum' in res else None
self._file_status = "unknown" # API does not currently include file status

@property
def display_name(self):
Expand Down Expand Up @@ -284,9 +300,13 @@ def download_file_retry(self, num_connections, output_dir, genomic_range_args, m
f"location")

if DataFile.is_genomic_range(genomic_range_args):
if self.data_client.api_version == 1:
endpoint_type = "files"
else:
endpoint_type = "reads" if self.name.endswith(".bam") or self.name.endswith(".cram") else "variants"
with open(output_file, 'wb') as output:
htsget.get(
f"{self.data_client.htsget_url}/files/{self.id}",
f"{self.data_client.htsget_url}/{endpoint_type}/{self.id}",
output,
reference_name=genomic_range_args[0], reference_md5=genomic_range_args[1],
start=genomic_range_args[2], end=genomic_range_args[3],
Expand Down
24 changes: 8 additions & 16 deletions pyega3/libs/data_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ def list_authorized_datasets(data_client):
" If you believe you should have access please contact helpdesk on helpdesk@ega-archive.org")
sys.exit()

return [DataSet(data_client, dataset_id) for dataset_id in reply]
if data_client.api_version == 1:
return [DataSet(data_client, dataset_id) for dataset_id in reply]

return [DataSet(data_client, dataset['datasetId']) for dataset in reply]

def list_files(self):
if self.id in LEGACY_DATASETS:
Expand All @@ -42,8 +45,9 @@ def list_files(self):

authorized_datasets = DataSet.list_authorized_datasets(self.data_client)

if self.id not in [dataset.id for dataset in authorized_datasets]:
logging.error(f"Dataset '{self.id}' is not in the list of your authorized datasets.")
authorized_dataset_ids = [dataset.id for dataset in authorized_datasets]
if self.id not in authorized_dataset_ids:
logging.error(f"Dataset '{self.id}' is not in the list of your authorized datasets ({authorized_dataset_ids})")
sys.exit()

reply = self.data_client.get_json(f"/datasets/{self.id}/files")
Expand All @@ -52,19 +56,7 @@ def list_files(self):
logging.error(f"List files in dataset {self.id} failed")
sys.exit()

def make_data_file(res):
display_file_name = res['displayFileName'] if 'displayFileName' in res else None
file_name = res['fileName'] if 'fileName' in res else None
size = res['fileSize'] if 'fileSize' in res else None
unencrypted_checksum = res['unencryptedChecksum'] if 'unencryptedChecksum' in res else None
return data_file.DataFile(self.data_client, res['fileId'],
display_file_name=display_file_name,
file_name=file_name,
size=size,
unencrypted_checksum=unencrypted_checksum,
status=res['fileStatus'])

return [make_data_file(res) for res in reply]
return [data_file.DataFile.from_metadata(self.data_client, res) for res in reply]

def download(self, num_connections, output_dir, genomic_range_args, max_retries=5, retry_wait=5,
max_slice_size=DataFile.DEFAULT_SLICE_SIZE):
Expand Down
8 changes: 6 additions & 2 deletions pyega3/libs/server_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ class ServerConfig:
url_api_ticket = None
client_secret = None

def __init__(self, url_api, url_auth, url_api_metadata, url_api_ticket, client_secret):
def __init__(self, api_version, url_api, url_auth, url_api_metadata, url_api_ticket, client_secret):
self.api_version = api_version
self.url_api = url_api
self.url_auth = url_auth
self.url_api_metadata = url_api_metadata
Expand Down Expand Up @@ -39,13 +40,16 @@ def check_key(key):
logging.error(f"{filepath} does not contain '{key}' field")
sys.exit()

api_version = 1 if 'api_version' not in custom_server_config else custom_server_config['api_version']

check_key('url_auth')
check_key('url_api')
check_key('url_api_ticket')
check_key('client_secret')
# Do not check url_api_metadata, it is optional

return ServerConfig(custom_server_config['url_api'],
return ServerConfig(api_version,
custom_server_config['url_api'],
custom_server_config['url_auth'],
custom_server_config['url_api_metadata'] if 'url_api_metadata' in custom_server_config else None,
custom_server_config['url_api_ticket'],
Expand Down
3 changes: 2 additions & 1 deletion pyega3/pyega3.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,8 @@ def main():
auth_client.credentials = credentials

data_client = DataClient(server_config.url_api, server_config.url_api_ticket, auth_client, standard_headers,
connections=args.connections, metadata_url=server_config.url_api_metadata)
connections=args.connections, metadata_url=server_config.url_api_metadata,
api_version=server_config.api_version)

execute_subcommand(args, data_client)

Expand Down
3 changes: 2 additions & 1 deletion tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def rand_str():

@pytest.fixture
def mock_server_config():
return ServerConfig(url_api='https://test.data.server',
return ServerConfig(api_version=1,
url_api='https://test.data.server',
url_auth='https://test.auth.server/ega-openid-connect-server/token',
url_api_metadata=None,
url_api_ticket='https://test.ticket.server',
Expand Down
25 changes: 8 additions & 17 deletions tests/unit/test_download_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pyega3.libs.data_set import DataSet


@mock.patch("pyega3.libs.data_file.DataFile")
@mock.patch("pyega3.libs.data_file.DataFile.from_metadata")
def test_calls_download_for_every_file_in_dataset(mocked_datafile, mock_data_server, dataset_with_files,
mock_server_config,
mock_auth_client, mock_data_client):
Expand All @@ -22,23 +22,18 @@ def test_calls_download_for_every_file_in_dataset(mocked_datafile, mock_data_ser
dataset.download(num_connections, None, None, 5, 5)

assert len(dataset_with_files.files) == mocked_datafile.call_count
mocked_datafile.assert_has_calls(
[mock.call(mock_data_client,
f['fileId'],
display_file_name=f['displayFileName'],
file_name=f['fileName'],
size=f['fileSize'],
unencrypted_checksum=f['unencryptedChecksum'],
status=f['fileStatus'])
for f in dataset_with_files.files])
mocked_datafile.assert_has_calls([mock.call(mock_data_client,
{k: v for k, v in f.items() if k not in ['fileContent']})
for f in dataset_with_files.files],
any_order=True)

for mock_file in mock_files:
assert len(mock_file.method_calls) == 1
assert mock_file.method_calls[0] == ('download_file_retry',
(num_connections, None, None, 5, 5, DataFile.DEFAULT_SLICE_SIZE))


@mock.patch("pyega3.libs.data_file.DataFile")
@mock.patch("pyega3.libs.data_file.DataFile.from_metadata")
def test_only_download_available_files(mocked_datafile, mock_server_config, mock_data_server, dataset_with_files,
mock_auth_client, mock_data_client):
num_connections = 5
Expand All @@ -59,12 +54,8 @@ def test_only_download_available_files(mocked_datafile, mock_server_config, mock

assert len(dataset_with_files.files) == mocked_datafile.call_count
mocked_datafile.assert_has_calls(
[mock.call(mock_data_client, f['fileId'],
display_file_name=f.get('displayFileName'),
file_name=f.get('fileName'),
size=f.get('fileSize'),
unencrypted_checksum=f.get('unencryptedChecksum'),
status=f.get('fileStatus')) for f in dataset_with_files.files])
[mock.call(mock_data_client, {k: v for k, v in f.items() if k not in ['fileContent']})
for f in dataset_with_files.files])

# The first file was not available so it should not have been called
assert len(mock_files[0].method_calls) == 0
Expand Down
21 changes: 21 additions & 0 deletions tests/unit/test_server_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,24 @@ def test_load_server_config_missing_attributes_in_json_file(mock_input_file):
with mock_input_file(json.dumps(config)) as server_config_file:
with pytest.raises(SystemExit):
ServerConfig.from_file(server_config_file)

def test_load_server_config_no_api_version(mock_input_file):
    """A config file without an 'api_version' entry loads with api_version 1."""
    config = {
        "url_auth": "http://url_auth",
        "url_api": "http://url_api",
        "client_secret": "secret",
        "url_api_ticket": "http://url_api_ticket",
    }
    serialized = json.dumps(config)

    with mock_input_file(serialized) as server_config_file:
        loaded = ServerConfig.from_file(server_config_file)
        assert loaded.api_version == 1

def test_load_server_config_with_api_version(mock_input_file):
    """An explicit 'api_version' entry in the config file is kept on the loaded object."""
    config = {
        "api_version": 2,
        "url_auth": "http://url_auth",
        "url_api": "http://url_api",
        "client_secret": "secret",
        "url_api_ticket": "http://url_api_ticket",
    }
    serialized = json.dumps(config)

    with mock_input_file(serialized) as server_config_file:
        loaded = ServerConfig.from_file(server_config_file)
        assert loaded.api_version == 2

0 comments on commit 35c7093

Please sign in to comment.