diff --git a/mfr/providers/osf/provider.py b/mfr/providers/osf/provider.py index cb7db783..dce2e56b 100644 --- a/mfr/providers/osf/provider.py +++ b/mfr/providers/osf/provider.py @@ -1,4 +1,5 @@ import os +import re import json import hashlib import logging @@ -159,11 +160,35 @@ async def download(self): return streams.ResponseStreamReader(response) + @staticmethod + def _is_osf_download_route(url): + """True when ``url`` is OSF's persistent file download path ``/download//``.""" + path = urlparse(url).path or '' + osf_download_path_re = re.compile(r'^/download/[^/]+', re.IGNORECASE) + return bool(osf_download_path_re.match(path)) + + async def _fetch_redirect(self, url, headers): + request = await self._make_request( + 'GET', + url, + allow_redirects=False, + headers=headers, + ) + status = request.status + location = request.headers.get('Location') or request.headers.get('location') + reason = request.reason + await request.release() + return location, status, reason + async def _fetch_download_url(self): """Provider needs a WaterButler URL to download and get metadata. If ``url`` is already a WaterButler url, return that. If not, then the url points to an OSF endpoint that will redirect to WB. Issue a GET request against it, then return the WB url stored in the Location header. + + First ``GET`` may return **301** with ``Location`` for legacy download route. If ``Location`` + points at OSF's ``/download//`` route, perform a second ``GET`` there and expect **302** + to WaterButler. If ``Location`` is not that route (e.g. already WaterButler), use it as ``download_url``. """ if not self.download_url: # v1 Waterbutler url provided @@ -173,26 +198,32 @@ async def _fetch_download_url(self): self.metrics.add('download_url.orig_type', 'wb_v1') else: self.metrics.add('download_url.orig_type', 'osf') - # make request to osf, don't follow, store waterbutler download url - request = await self._make_request( - 'GET', - self.url, - allow_redirects=False, - headers={ - 'Content-Type': 'application/json' - } - ) - await request.release() + # make request to osf, store waterbutler download url + headers = {'Content-Type': 'application/json'} - logger.debug(f'osf-download-resolver: request.status::{request.status}') - if request.status != 302: + location, status, reason = await self._fetch_redirect(self.url, headers) + logger.debug(f'osf-download-resolver: url={self.url} status={status} location={location}') + if status not in (301, 302) or not location: raise exceptions.MetadataError( - request.reason, + reason, metadata_url=self.url, provider=self.NAME, - code=request.status, + code=status, ) - self.download_url = request.headers['location'] + + if self._is_osf_download_route(location): + location, status, reason = await self._fetch_redirect(location, headers) + + logger.debug(f'osf-download-resolver follow /download/: url={location} status={status} location={location}') + if status != 302 or not location: + raise exceptions.MetadataError( + reason, + metadata_url=location, + provider=self.NAME, + code=status, + ) + + self.download_url = location self.metrics.add('download_url.derived_url', str(self.download_url))