Issue #83: Use Asyncio (retries).
Nekmo committed Aug 11, 2023
1 parent ef98f5e commit 9da7d93
Showing 3 changed files with 20 additions and 7 deletions.
dirhunt/configuration.py (2 additions, 0 deletions)
@@ -29,6 +29,7 @@ class ConfigurationDict(TypedDict):
     user_agent: Optional[str]
     cookies: Dict[str, str]
     headers: Dict[str, str]
+    retries: Optional[int]
 
 
 @dataclass
@@ -59,3 +60,4 @@ class Configuration:
     user_agent: Optional[str] = None
     cookies: Dict[str, str] = field(default_factory=dict)
     headers: Dict[str, str] = field(default_factory=dict)
+    retries: Optional[int] = None
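
The new retries field is optional on both the TypedDict and the dataclass, so existing configurations are unaffected and retrying stays opt-in. A minimal sketch of setting it (assuming, as the defaults shown above suggest, that Configuration can be constructed without other arguments):

    from dirhunt.configuration import Configuration

    # retries is None unless set explicitly; None disables retrying.
    config = Configuration(retries=3)
    assert config.retries == 3
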
dirhunt/crawler_url.py (13 additions, 6 deletions)
@@ -18,8 +18,9 @@
     "not_found.fake": 3,
     "html": 2,
 }
-URL_TYPES = Literal["index_file",]  # index.php, index.html, index.htm, etc.
+URL_TYPES = Literal["index_file"]  # index.php, index.html, index.htm, etc.
 DEFAULT_ENCODING = "utf-8"
+RETRIES_WAIT = 2
 
 
 if TYPE_CHECKING:
@@ -51,7 +52,9 @@ def __init__(self, crawler_url: "CrawlerUrl"):
         self.crawler_url = crawler_url
         self.crawler = crawler_url.crawler
 
-    async def retrieve(self) -> Optional["ProcessBase"]:
+    async def retrieve(self, retries: Optional[int] = None) -> Optional["ProcessBase"]:
+        if retries is None:
+            retries = self.crawler.configuration.retries
         from dirhunt.processors import (
             get_processor,
         )
@@ -74,10 +77,14 @@ async def retrieve(self) -> Optional["ProcessBase"]:
                 if processor.has_descendants:
                     processor = get_processor(self)
         except (ClientError, asyncio.TimeoutError) as e:
-            self.crawler.current_processed_count += 1
-            self.crawler.print_error(
-                f"Request error to {self.crawler_url.url}: {get_message_from_exception(e)}"
-            )
+            if retries and retries > 0:
+                await asyncio.sleep(RETRIES_WAIT)
+                return await self.retrieve(retries - 1)
+            else:
+                self.crawler.current_processed_count += 1
+                self.crawler.print_error(
+                    f"Request error to {self.crawler_url.url}: {get_message_from_exception(e)}"
+                )
         else:
             await processor.process(self)
         finally:
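
The change replaces the old "count and report" error handling with a fixed-delay, recursive retry: on a ClientError or timeout, retrieve() sleeps RETRIES_WAIT seconds and calls itself with the counter decremented, counting and printing the error only once the budget reaches zero. A self-contained sketch of the same pattern, with illustrative names rather than dirhunt's real API:

    import asyncio
    from typing import Optional

    RETRIES_WAIT = 2  # seconds between attempts, as in the diff

    async def fetch_with_retries(url: str, retries: Optional[int] = None) -> Optional[str]:
        """Fixed-delay retry via recursion, mirroring retrieve()."""
        try:
            return await fake_request(url)  # stand-in for the real aiohttp request
        except (ConnectionError, asyncio.TimeoutError):
            if retries and retries > 0:
                await asyncio.sleep(RETRIES_WAIT)
                return await fetch_with_retries(url, retries - 1)
            return None  # budget exhausted: give up and report

    async def fake_request(url: str) -> str:
        raise ConnectionError(f"simulated failure for {url}")

    asyncio.run(fetch_with_retries("http://example.com", retries=2))

Because retries may be None (the default when no value is configured), the `retries and retries > 0` guard treats None and 0 the same way: no retry.
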
dirhunt/management.py (5 additions, 1 deletion)
@@ -248,13 +248,17 @@ def flags_range(flags):
     help="Add a cookie to requests in the cookie_name:value format.",
 )
 @click.option(
-    "-h",
     "--header",
     "headers",
     callback=key_value,
     multiple=True,
     help="Add a header to requests in the header:value format.",
 )
 @click.option(
+    "--retries",
+    type=int,
+    help="Retry errors the indicated number of times.",
+)
+@click.option(
     "--version", is_flag=True, callback=print_version, expose_value=False, is_eager=True
 )
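
The CLI wires the flag through click in the same shape as the existing options; when omitted it yields None, matching the Configuration.retries default. A standalone sketch of an option with the same signature (the command name and its body are illustrative, not dirhunt's):

    import click

    @click.command()
    @click.option(
        "--retries",
        type=int,
        help="Retry errors the indicated number of times.",
    )
    def hunt(retries):
        # retries is None when the flag is omitted, an int otherwise.
        click.echo(f"retries={retries!r}")

    if __name__ == "__main__":
        hunt()  # e.g. `python hunt.py --retries 3` prints retries=3
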
