From 91a370e50cc0cf2ccae74b08fe0c0f9fd36a8f8b Mon Sep 17 00:00:00 2001
From: Matt Jeff
Date: Tue, 1 Oct 2024 15:18:15 +0100
Subject: [PATCH] add timeout argument

---
 scidownl/api/scihub.py      | 6 ++++--
 scidownl/core/crawler.py    | 3 ++-
 scidownl/core/downloader.py | 3 ++-
 scidownl/core/task.py       | 5 ++++-
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/scidownl/api/scihub.py b/scidownl/api/scihub.py
index 40de6f0..0f8c17e 100644
--- a/scidownl/api/scihub.py
+++ b/scidownl/api/scihub.py
@@ -9,7 +9,8 @@ def scihub_download(
         paper_type: str = 'doi',
         scihub_url: str = None,
         out: str = None,
-        proxies: dict = None
+        proxies: dict = None,
+        timeout: int = None,
 ) -> None:
     """Download a paper from SciHub.
 
@@ -33,6 +34,7 @@ def scihub_download(
         source_type=paper_type,
         scihub_url=scihub_url,
         out=out,
-        proxies=proxies
+        proxies=proxies,
+        timeout=timeout
     ).run()
 
diff --git a/scidownl/core/crawler.py b/scidownl/core/crawler.py
index b503a12..21f3186 100644
--- a/scidownl/core/crawler.py
+++ b/scidownl/core/crawler.py
@@ -34,9 +34,10 @@ def crawl(self) -> HtmlContent:
             'request': self.source[self.source.type]
         }
         proxies = self.task.context.get('proxies', {}) if self.task is not None else {}
+        timeout = self.task.context.get('timeout', None) if self.task is not None else None
 
         logger.info(f"<- Request: scihub_url={self.scihub_url}, source={self.source}, proxies={proxies}")
-        res = self.sess.post(self.scihub_url, data=request_params, proxies=proxies)
+        res = self.sess.post(self.scihub_url, data=request_params, proxies=proxies, timeout=timeout)
         logger.info(f"-> Response: status_code={res.status_code}, content_length={len(res.content.decode())}")
 
         if res.status_code not in ScihubCrawler.OK_STATUS_CODES:
diff --git a/scidownl/core/downloader.py b/scidownl/core/downloader.py
index 5956ccd..1cb8421 100644
--- a/scidownl/core/downloader.py
+++ b/scidownl/core/downloader.py
@@ -31,7 +31,8 @@ def download(self, out: str) -> str:
         try:
             url = self.information.get_url()
             proxies = self.task.context.get('proxies', {}) if self.task is not None else {}
-            res = requests.get(url, stream=True, proxies=proxies)
+            timeout = self.task.context.get('timeout', None) if self.task is not None else None
+            res = requests.get(url, stream=True, proxies=proxies, timeout=timeout)
             total_length = res.headers.get('content-length')
 
             with open(out, "wb") as f:
diff --git a/scidownl/core/task.py b/scidownl/core/task.py
index 7e13b12..0387170 100644
--- a/scidownl/core/task.py
+++ b/scidownl/core/task.py
@@ -28,7 +28,8 @@ def __init__(self,
                  scihub_url: str = None,
                  scihub_url_chooser_cls=default_chooser_cls,
                  out: str = None,
-                 proxies: dict = None):
+                 proxies: dict = None,
+                 timeout: int = None):
         super().__init__()
         self.source_keyword = source_keyword
         self.scihub_url_chooser_cls = scihub_url_chooser_cls
@@ -37,8 +38,10 @@ def __init__(self,
         self.source_class = source_classes.get(source_type, DoiSource)
         self.out = out
         self.proxies = proxies or {}
+        self.timeout = timeout
         self.context['status'] = 'initialized'
         self.context['proxies'] = self.proxies
+        self.context['timeout'] = timeout
         self.service = ScihubUrlService()
         self.updater = CrawlingScihubDomainUpdater()
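
Usage sketch (not part of the patch): with this change applied, a caller can bound how long scidownl waits on Sci-Hub before giving up, since the value is forwarded through the task context to the `timeout=` argument of `requests.post`/`requests.get`. The leading `keyword` positional argument is assumed from the existing public API (it sits outside the hunks above), and the DOI and output path below are placeholders, not real values.

    from scidownl.api.scihub import scihub_download

    # Placeholder DOI and output path; keyword/paper_type follow the existing API.
    scihub_download(
        "10.1000/placeholder-doi",   # paper identifier (a DOI here)
        paper_type="doi",
        out="./paper.pdf",
        timeout=30,  # seconds; new argument, passed through to requests
    )

Leaving `timeout` unset keeps the previous behavior: requests is called with `timeout=None` and may wait indefinitely for a response.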