From 91a370e50cc0cf2ccae74b08fe0c0f9fd36a8f8b Mon Sep 17 00:00:00 2001
From: Matt Jeff
Date: Tue, 1 Oct 2024 15:18:15 +0100
Subject: [PATCH] add timeout argument

---
 scidownl/api/scihub.py      | 6 ++++--
 scidownl/core/crawler.py    | 3 ++-
 scidownl/core/downloader.py | 3 ++-
 scidownl/core/task.py       | 5 ++++-
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/scidownl/api/scihub.py b/scidownl/api/scihub.py
index 40de6f0..0f8c17e 100644
--- a/scidownl/api/scihub.py
+++ b/scidownl/api/scihub.py
@@ -9,7 +9,8 @@ def scihub_download(
         paper_type: str = 'doi',
         scihub_url: str = None,
         out: str = None,
-        proxies: dict = None
+        proxies: dict = None,
+        timeout: int = None,
 ) -> None:
     """Download a paper from SciHub.
 
@@ -33,6 +34,7 @@ def scihub_download(
         source_type=paper_type,
         scihub_url=scihub_url,
         out=out,
-        proxies=proxies
+        proxies=proxies,
+        timeout=timeout
     ).run()
 
diff --git a/scidownl/core/crawler.py b/scidownl/core/crawler.py
index b503a12..21f3186 100644
--- a/scidownl/core/crawler.py
+++ b/scidownl/core/crawler.py
@@ -34,9 +34,10 @@ def crawl(self) -> HtmlContent:
             'request': self.source[self.source.type]
         }
         proxies = self.task.context.get('proxies', {}) if self.task is not None else {}
+        timeout = self.task.context.get('timeout', None) if self.task is not None else None
 
         logger.info(f"<- Request: scihub_url={self.scihub_url}, source={self.source}, proxies={proxies}")
-        res = self.sess.post(self.scihub_url, data=request_params, proxies=proxies)
+        res = self.sess.post(self.scihub_url, data=request_params, proxies=proxies, timeout=timeout)
         logger.info(f"-> Response: status_code={res.status_code}, content_length={len(res.content.decode())}")
 
         if res.status_code not in ScihubCrawler.OK_STATUS_CODES:
diff --git a/scidownl/core/downloader.py b/scidownl/core/downloader.py
index 5956ccd..1cb8421 100644
--- a/scidownl/core/downloader.py
+++ b/scidownl/core/downloader.py
@@ -31,7 +31,8 @@ def download(self, out: str) -> str:
         try:
             url = self.information.get_url()
             proxies = self.task.context.get('proxies', {}) if self.task is not None else {}
-            res = requests.get(url, stream=True, proxies=proxies)
+            timeout = self.task.context.get('timeout', None) if self.task is not None else None
+            res = requests.get(url, stream=True, proxies=proxies, timeout=timeout)
             total_length = res.headers.get('content-length')
 
             with open(out, "wb") as f:
diff --git a/scidownl/core/task.py b/scidownl/core/task.py
index 7e13b12..0387170 100644
--- a/scidownl/core/task.py
+++ b/scidownl/core/task.py
@@ -28,7 +28,8 @@ def __init__(self,
                  scihub_url: str = None,
                  scihub_url_chooser_cls=default_chooser_cls,
                  out: str = None,
-                 proxies: dict = None):
+                 proxies: dict = None,
+                 timeout: int = None):
         super().__init__()
         self.source_keyword = source_keyword
         self.scihub_url_chooser_cls = scihub_url_chooser_cls
@@ -37,8 +38,10 @@ def __init__(self,
         self.source_class = source_classes.get(source_type, DoiSource)
         self.out = out
         self.proxies = proxies or {}
+        self.timeout = timeout
         self.context['status'] = 'initialized'
         self.context['proxies'] = self.proxies
+        self.context['timeout'] = timeout
         self.service = ScihubUrlService()
         self.updater = CrawlingScihubDomainUpdater()
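
Usage sketch (not part of the patch): with this change applied, a caller can bound how long scidownl waits on Sci-Hub before giving up, since the value is forwarded through the task context to the `timeout=` argument of `requests.post`/`requests.get`. The leading `keyword` positional argument is assumed from the existing public API (it sits outside the hunks above), and the DOI and output path below are placeholders, not real values.

    from scidownl.api.scihub import scihub_download

    # Placeholder DOI and output path; keyword/paper_type follow the existing API.
    scihub_download(
        "10.1000/placeholder-doi",   # paper identifier (a DOI here)
        paper_type="doi",
        out="./paper.pdf",
        timeout=30,  # seconds; new argument, passed through to requests
    )

Leaving `timeout` unset keeps the previous behavior: requests is called with `timeout=None` and may wait indefinitely for a response.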