Issue #83: Use Asyncio
Nekmo committed Aug 9, 2023
1 parent c5ba130 commit b5a2d14
Showing 4 changed files with 197 additions and 120 deletions.
4 changes: 4 additions & 0 deletions dirhunt/colors.py
@@ -10,6 +10,10 @@ def status_code_colors(status_code):
return "green3"
elif 300 <= status_code < 400:
return "deep_sky_blue1"
elif 400 <= status_code < 404 or 404 < status_code < 500:
return "deep_pink2"
elif 404 == status_code:
return "bright_red"
elif 500 == status_code:
return "magenta1"
else:
Expand Down
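
For reference, these return values look like rich color styles (the crawler prints through a rich Console elsewhere in this commit), so a caller could use them directly as markup. A minimal sketch under that assumption; the loop and labels are illustrative, not part of the commit:

from rich.console import Console

from dirhunt.colors import status_code_colors

console = Console()
for status in (200, 301, 403, 404, 500):
    # with this change, 404 renders bright_red while other 4xx codes stay deep_pink2
    console.print(f"[{status_code_colors(status)}]HTTP {status}[/]")
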
16 changes: 11 additions & 5 deletions dirhunt/crawler.py
@@ -29,6 +29,7 @@
from dirhunt.json_report import JsonReportEncoder
from dirhunt.sessions import Sessions, Session
from dirhunt.sources import Sources
from dirhunt.url import Url
from dirhunt.url_info import UrlsInfo

"""Flags importance"""
@@ -82,19 +83,24 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop
    async def start(self):
        """Add urls to process."""
        for url in self.configuration.urls:
            await self.add_crawler_url(
                CrawlerUrl(self, url, depth=self.configuration.max_depth)
            )
            crawler_url = CrawlerUrl(self, url, depth=self.configuration.max_depth)
            self.domains.add(crawler_url.url.domain)
            await self.add_crawler_url(crawler_url)

        while self.tasks:
            await asyncio.wait(self.tasks)
        await self.session.close()

    async def add_crawler_url(self, crawler_url: CrawlerUrl) -> Optional[asyncio.Task]:
        """Add crawler_url to tasks"""
        if crawler_url.url.url in self.crawler_urls:
        if (
            crawler_url in self.crawler_urls
            or crawler_url.url.domain not in self.domains
        ):
            return
        self.crawler_urls.add(crawler_url)
        task = self.loop.create_task(crawler_url.retrieve())
        self.tasks.add(task)
        self.crawler_urls.add(crawler_url)
        task.add_done_callback(self.tasks.discard)
        return task
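
The reworked add_crawler_url relies on the usual asyncio bookkeeping of holding tasks in a set and discarding them on completion, which Crawler.start then drains with asyncio.wait. A standalone sketch of that pattern (worker, main and the delay are illustrative, not dirhunt code):

import asyncio

async def worker(n: int) -> None:
    await asyncio.sleep(0.1)
    print(f"worker {n} done")

async def main() -> None:
    tasks: set = set()
    loop = asyncio.get_running_loop()
    for n in range(3):
        task = loop.create_task(worker(n))
        # hold a strong reference so the task is not garbage collected mid-flight
        tasks.add(task)
        # drop the reference automatically once the task finishes
        task.add_done_callback(tasks.discard)
    # wait for whatever is still pending, as Crawler.start does
    while tasks:
        await asyncio.wait(tasks)

asyncio.run(main())
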
75 changes: 38 additions & 37 deletions dirhunt/crawler_url.py
@@ -1,28 +1,46 @@
# -*- coding: utf-8 -*-
import cgi
import socket
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Optional, Literal

from aiohttp import ClientResponse
from aiohttp.web_response import Response
from bs4 import BeautifulSoup
from requests import RequestException
from urllib3.exceptions import ReadTimeoutError
import charset_normalizer as chardet

from dirhunt.url import Url

RESPONSE_CHUNK = 1024 * 4
MAX_RESPONSE_SIZE = 1024 * 512
FLAGS_WEIGHT = {
    "blank": 4,
    "not_found.fake": 3,
    "html": 2,
}
URL_TYPES = Literal["index_file",] # index.php, index.html, index.htm, etc.
DEFAULT_ENCODING = "utf-8"


if TYPE_CHECKING:
    from dirhunt.crawler import Crawler
    from dirhunt.processors import ProcessBase


async def get_content(response: "ClientResponse") -> str:
    try:
        encoding = response.get_encoding()
    except RuntimeError:
        # aiohttp can't detect encoding if the content is not available
        encoding = None
    data = b""
    async for chunk in response.content.iter_chunked(RESPONSE_CHUNK):
        data += chunk
        if not chunk or len(data) >= MAX_RESPONSE_SIZE:
            break
    encoding = encoding or chardet.detect(data)["encoding"]
    return data.decode(encoding or DEFAULT_ENCODING, errors="ignore")


class CrawlerUrlRequest:
    response = Optional[Response]
    content: Optional[str] = None
@@ -38,7 +56,6 @@ async def retrieve(self) -> "ProcessBase":
            Error,
        )

        text = ""
        try:
            await self.crawler.domain_semaphore.acquire(self.crawler_url.url.domain)
            pass
@@ -53,31 +70,9 @@ async def retrieve(self) -> "ProcessBase":
                self.response = response
                processor = get_processor(self)
                if processor and processor.requires_content:
                    encoding = response.get_encoding()
                    self.content = (
                        await response.content.read(MAX_RESPONSE_SIZE)
                    ).decode(encoding, errors="ignore")
                    self.content = await get_content(response)
                if processor.has_descendants:
                    processor = get_processor(self)
            # text = ""
            # soup = None
            # processor = None
            # if response.status_code < 300 and self.must_be_downloaded(response):
            #     try:
            #         text = response.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
            #     except (RequestException, ReadTimeoutError, socket.timeout) as e:
            #         self.crawler.current_processed_count += 1
            #         self.crawler.results.put(Error(self, e))
            #         self.close()
            #         return self
            #     content_type = cgi.parse_header(
            #         response.headers.get("Content-Type", "")
            #     )[0]
            #     soup = (
            #         BeautifulSoup(text, "html.parser")
            #         if content_type == "text/html"
            #         else None
            #     )
        except RequestException as e:
            self.crawler.current_processed_count += 1
            processor = Error(self, e)
@@ -104,16 +99,19 @@ def __init__(
        self,
        crawler: "Crawler",
        target_url: str,
        depth=3,
        source=None,
        exists=None,
        url_type=None,
        depth: int = 3,
        source: Optional["CrawlerUrl"] = None,
        exists: bool = None,
        url_type: Optional[URL_TYPES] = None,
    ):
        """
        :type crawler: Crawler instance
        :type target_url: Uniform Resource Identifier as string
        :type depth: int maximum depth to crawl respect to the initial url
        :type source: CrawlerUrl instance. Optional.
        :type exists: bool. If exists is True the path surely exists. Optional.
        :type url_type: str. Optional.
        """
        self.target_url = target_url
        url = Url(target_url)
@@ -152,7 +150,11 @@ async def retrieve(self):

        crawler_url_request = CrawlerUrlRequest(self)
        processor = await crawler_url_request.retrieve()
        if processor is not None and not isinstance(processor, GenericProcessor):
        if (
            processor is not None
            and not isinstance(processor, GenericProcessor)
            and self.url_type not in {"asset", "index_file"}
        ):
            self.crawler.console.print(processor.get_text())
        # if self.must_be_downloaded(response):
        #     processor = get_processor(response, text, self, soup) or GenericProcessor(
@@ -174,11 +176,10 @@ async def retrieve(self):
            and crawler_url_request.response.status < 404
        ):
            self.exists = True
        # TODO: uncomment
        # await self.add_self_directories(
        #     True if (not self.maybe_rewrite() and self.exists) else None,
        #     "directory" if not self.maybe_rewrite() else None,
        # )
        await self.add_self_directories(
            True if (not self.maybe_rewrite() and self.exists) else None,
            "directory" if not self.maybe_rewrite() else None,
        )

    def set_type(self, content_type):
        from dirhunt.processors import INDEX_FILES
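
The new get_content helper above streams the body in chunks, stops once MAX_RESPONSE_SIZE is buffered, and falls back to charset_normalizer when aiohttp cannot report an encoding. A self-contained sketch of the same pattern outside dirhunt, assuming a reachable example URL; it mirrors the committed helper rather than importing it:

import asyncio

import aiohttp
import charset_normalizer as chardet

RESPONSE_CHUNK = 1024 * 4
MAX_RESPONSE_SIZE = 1024 * 512
DEFAULT_ENCODING = "utf-8"

async def get_content(response: aiohttp.ClientResponse) -> str:
    try:
        encoding = response.get_encoding()
    except RuntimeError:
        # aiohttp raises RuntimeError when the body hasn't been read yet
        encoding = None
    data = b""
    async for chunk in response.content.iter_chunked(RESPONSE_CHUNK):
        data += chunk
        if not chunk or len(data) >= MAX_RESPONSE_SIZE:
            break  # stop early instead of buffering huge responses
    encoding = encoding or chardet.detect(data)["encoding"]
    return data.decode(encoding or DEFAULT_ENCODING, errors="ignore")

async def main() -> None:
    async with aiohttp.ClientSession() as session:
        async with session.get("https://example.com/") as response:
            print((await get_content(response))[:200])

asyncio.run(main())
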
