Issue #83: Use Asyncio (retries).
Nekmo committed Aug 11, 2023
1 parent ef98f5e commit 9da7d93
Showing 3 changed files with 20 additions and 7 deletions.
dirhunt/configuration.py (2 additions, 0 deletions)
@@ -29,6 +29,7 @@ class ConfigurationDict(TypedDict):
     user_agent: Optional[str]
     cookies: Dict[str, str]
     headers: Dict[str, str]
+    retries: Optional[int]
 
 
 @dataclass
@@ -59,3 +60,4 @@ class Configuration:
     user_agent: Optional[str] = None
     cookies: Dict[str, str] = field(default_factory=dict)
     headers: Dict[str, str] = field(default_factory=dict)
+    retries: Optional[int] = None
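
The new retries field is optional on both the TypedDict and the dataclass, so existing configurations are unaffected and retrying stays opt-in. A minimal sketch of setting it (assuming, as the defaults shown above suggest, that Configuration can be constructed without other arguments):

    from dirhunt.configuration import Configuration

    # retries is None unless set explicitly; None disables retrying.
    config = Configuration(retries=3)
    assert config.retries == 3
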
dirhunt/crawler_url.py (13 additions, 6 deletions)
@@ -18,8 +18,9 @@
     "not_found.fake": 3,
     "html": 2,
 }
-URL_TYPES = Literal["index_file",]  # index.php, index.html, index.htm, etc.
+URL_TYPES = Literal["index_file"]  # index.php, index.html, index.htm, etc.
 DEFAULT_ENCODING = "utf-8"
+RETRIES_WAIT = 2
 
 
 if TYPE_CHECKING:
@@ -51,7 +52,9 @@ def __init__(self, crawler_url: "CrawlerUrl"):
         self.crawler_url = crawler_url
         self.crawler = crawler_url.crawler
 
-    async def retrieve(self) -> Optional["ProcessBase"]:
+    async def retrieve(self, retries: Optional[int] = None) -> Optional["ProcessBase"]:
+        if retries is None:
+            retries = self.crawler.configuration.retries
         from dirhunt.processors import (
             get_processor,
         )
@@ -74,10 +77,14 @@ async def retrieve(self) -> Optional["ProcessBase"]:
                 if processor.has_descendants:
                     processor = get_processor(self)
         except (ClientError, asyncio.TimeoutError) as e:
-            self.crawler.current_processed_count += 1
-            self.crawler.print_error(
-                f"Request error to {self.crawler_url.url}: {get_message_from_exception(e)}"
-            )
+            if retries and retries > 0:
+                await asyncio.sleep(RETRIES_WAIT)
+                return await self.retrieve(retries - 1)
+            else:
+                self.crawler.current_processed_count += 1
+                self.crawler.print_error(
+                    f"Request error to {self.crawler_url.url}: {get_message_from_exception(e)}"
+                )
         else:
             await processor.process(self)
         finally:
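
The change replaces the old "count and report" error handling with a fixed-delay, recursive retry: on a ClientError or timeout, retrieve() sleeps RETRIES_WAIT seconds and calls itself with the counter decremented, counting and printing the error only once the budget reaches zero. A self-contained sketch of the same pattern, with illustrative names rather than dirhunt's real API:

    import asyncio
    from typing import Optional

    RETRIES_WAIT = 2  # seconds between attempts, as in the diff

    async def fetch_with_retries(url: str, retries: Optional[int] = None) -> Optional[str]:
        """Fixed-delay retry via recursion, mirroring retrieve()."""
        try:
            return await fake_request(url)  # stand-in for the real aiohttp request
        except (ConnectionError, asyncio.TimeoutError):
            if retries and retries > 0:
                await asyncio.sleep(RETRIES_WAIT)
                return await fetch_with_retries(url, retries - 1)
            return None  # budget exhausted: give up and report

    async def fake_request(url: str) -> str:
        raise ConnectionError(f"simulated failure for {url}")

    asyncio.run(fetch_with_retries("http://example.com", retries=2))

Because retries may be None (the default when no value is configured), the `retries and retries > 0` guard treats None and 0 the same way: no retry.
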
dirhunt/management.py (5 additions, 1 deletion)
@@ -248,13 +248,17 @@ def flags_range(flags):
     help="Add a cookie to requests in the cookie_name:value format.",
 )
 @click.option(
-    "-h",
     "--header",
     "headers",
     callback=key_value,
     multiple=True,
     help="Add a header to requests in the header:value format.",
 )
 @click.option(
+    "--retries",
+    type=int,
+    help="Retry errors the indicated number of times.",
+)
+@click.option(
     "--version", is_flag=True, callback=print_version, expose_value=False, is_eager=True
 )
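
The CLI wires the flag through click in the same shape as the existing options; when omitted it yields None, matching the Configuration.retries default. A standalone sketch of an option with the same signature (the command name and its body are illustrative, not dirhunt's):

    import click

    @click.command()
    @click.option(
        "--retries",
        type=int,
        help="Retry errors the indicated number of times.",
    )
    def hunt(retries):
        # retries is None when the flag is omitted, an int otherwise.
        click.echo(f"retries={retries!r}")

    if __name__ == "__main__":
        hunt()  # e.g. `python hunt.py --retries 3` prints retries=3
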
