Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 46 additions & 15 deletions mfr/providers/osf/provider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re
import json
import hashlib
import logging
Expand Down Expand Up @@ -159,11 +160,35 @@ async def download(self):

return streams.ResponseStreamReader(response)

@staticmethod
def _is_osf_download_route(url):
"""True when ``url`` is OSF's persistent file download path ``/download/<id>/``."""
path = urlparse(url).path or ''
osf_download_path_re = re.compile(r'^/download/[^/]+', re.IGNORECASE)
return bool(osf_download_path_re.match(path))

async def _fetch_redirect(self, url, headers):
request = await self._make_request(
'GET',
url,
allow_redirects=False,
headers=headers,
)
status = request.status
location = request.headers.get('Location') or request.headers.get('location')
reason = request.reason
await request.release()
return location, status, reason

async def _fetch_download_url(self):
"""Provider needs a WaterButler URL to download and get metadata. If ``url`` is already
a WaterButler url, return that. If not, then the url points to an OSF endpoint that will
redirect to WB. Issue a GET request against it, then return the WB url stored in the
Location header.

First ``GET`` may return **301** with ``Location`` for legacy download route. If ``Location``
points at OSF's ``/download/<id>/`` route, perform a second ``GET`` there and expect **302**
to WaterButler. If ``Location`` is not that route (e.g. already WaterButler), use it as ``download_url``.
"""
if not self.download_url:
# v1 Waterbutler url provided
Expand All @@ -173,26 +198,32 @@ async def _fetch_download_url(self):
self.metrics.add('download_url.orig_type', 'wb_v1')
else:
self.metrics.add('download_url.orig_type', 'osf')
# make request to osf, don't follow, store waterbutler download url
request = await self._make_request(
'GET',
self.url,
allow_redirects=False,
headers={
'Content-Type': 'application/json'
}
)
await request.release()
# make request to osf, store waterbutler download url
headers = {'Content-Type': 'application/json'}

logger.debug(f'osf-download-resolver: request.status::{request.status}')
if request.status != 302:
location, status, reason = await self._fetch_redirect(self.url, headers)
logger.debug(f'osf-download-resolver: url={self.url} status={status} location={location}')
if status not in (301, 302) or not location:
raise exceptions.MetadataError(
request.reason,
reason,
metadata_url=self.url,
provider=self.NAME,
code=request.status,
code=status,
)
self.download_url = request.headers['location']

if self._is_osf_download_route(location):
location, status, reason = await self._fetch_redirect(location, headers)

logger.debug(f'osf-download-resolver follow /download/: url={location} status={status} location={location}')
if status != 302 or not location:
raise exceptions.MetadataError(
reason,
metadata_url=location,
provider=self.NAME,
code=status,
)

self.download_url = location

self.metrics.add('download_url.derived_url', str(self.download_url))

Expand Down
Loading