Issue #83: Use Asyncio
Nekmo committed Aug 9, 2023
1 parent c5ba130 commit b5a2d14
Showing 4 changed files with 197 additions and 120 deletions.
4 changes: 4 additions & 0 deletions dirhunt/colors.py
@@ -10,6 +10,10 @@ def status_code_colors(status_code):
return "green3"
elif 300 <= status_code < 400:
return "deep_sky_blue1"
elif 400 <= status_code < 404 or 404 < status_code < 500:
return "deep_pink2"
elif 404 == status_code:
return "bright_red"
elif 500 == status_code:
return "magenta1"
else:
Expand Down
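
For reference, these return values look like rich color styles (the crawler prints through a rich Console elsewhere in this commit), so a caller could use them directly as markup. A minimal sketch under that assumption; the loop and labels are illustrative, not part of the commit:

from rich.console import Console

from dirhunt.colors import status_code_colors

console = Console()
for status in (200, 301, 403, 404, 500):
    # with this change, 404 renders bright_red while other 4xx codes stay deep_pink2
    console.print(f"[{status_code_colors(status)}]HTTP {status}[/]")
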
16 changes: 11 additions & 5 deletions dirhunt/crawler.py
@@ -29,6 +29,7 @@
from dirhunt.json_report import JsonReportEncoder
from dirhunt.sessions import Sessions, Session
from dirhunt.sources import Sources
from dirhunt.url import Url
from dirhunt.url_info import UrlsInfo

"""Flags importance"""
@@ -82,19 +83,24 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop
    async def start(self):
        """Add urls to process."""
        for url in self.configuration.urls:
            await self.add_crawler_url(
                CrawlerUrl(self, url, depth=self.configuration.max_depth)
            )
            crawler_url = CrawlerUrl(self, url, depth=self.configuration.max_depth)
            self.domains.add(crawler_url.url.domain)
            await self.add_crawler_url(crawler_url)

        while self.tasks:
            await asyncio.wait(self.tasks)
        await self.session.close()

    async def add_crawler_url(self, crawler_url: CrawlerUrl) -> Optional[asyncio.Task]:
        """Add crawler_url to tasks"""
        if crawler_url.url.url in self.crawler_urls:
        if (
            crawler_url in self.crawler_urls
            or crawler_url.url.domain not in self.domains
        ):
            return
        self.crawler_urls.add(crawler_url)
        task = self.loop.create_task(crawler_url.retrieve())
        self.tasks.add(task)
        self.crawler_urls.add(crawler_url)
        task.add_done_callback(self.tasks.discard)
        return task
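
The reworked add_crawler_url relies on the usual asyncio bookkeeping of holding tasks in a set and discarding them on completion, which Crawler.start then drains with asyncio.wait. A standalone sketch of that pattern (worker, main and the delay are illustrative, not dirhunt code):

import asyncio

async def worker(n: int) -> None:
    await asyncio.sleep(0.1)
    print(f"worker {n} done")

async def main() -> None:
    tasks: set = set()
    loop = asyncio.get_running_loop()
    for n in range(3):
        task = loop.create_task(worker(n))
        # hold a strong reference so the task is not garbage collected mid-flight
        tasks.add(task)
        # drop the reference automatically once the task finishes
        task.add_done_callback(tasks.discard)
    # wait for whatever is still pending, as Crawler.start does
    while tasks:
        await asyncio.wait(tasks)

asyncio.run(main())
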
75 changes: 38 additions & 37 deletions dirhunt/crawler_url.py
@@ -1,28 +1,46 @@
# -*- coding: utf-8 -*-
import cgi
import socket
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Optional, Literal

from aiohttp import ClientResponse
from aiohttp.web_response import Response
from bs4 import BeautifulSoup
from requests import RequestException
from urllib3.exceptions import ReadTimeoutError
import charset_normalizer as chardet

from dirhunt.url import Url

RESPONSE_CHUNK = 1024 * 4
MAX_RESPONSE_SIZE = 1024 * 512
FLAGS_WEIGHT = {
    "blank": 4,
    "not_found.fake": 3,
    "html": 2,
}
URL_TYPES = Literal["index_file",] # index.php, index.html, index.htm, etc.
DEFAULT_ENCODING = "utf-8"


if TYPE_CHECKING:
    from dirhunt.crawler import Crawler
    from dirhunt.processors import ProcessBase


async def get_content(response: "ClientResponse") -> str:
    try:
        encoding = response.get_encoding()
    except RuntimeError:
        # aiohttp can't detect encoding if the content is not available
        encoding = None
    data = b""
    async for chunk in response.content.iter_chunked(RESPONSE_CHUNK):
        data += chunk
        if not chunk or len(data) >= MAX_RESPONSE_SIZE:
            break
    encoding = encoding or chardet.detect(data)["encoding"]
    return data.decode(encoding or DEFAULT_ENCODING, errors="ignore")


class CrawlerUrlRequest:
    response = Optional[Response]
    content: Optional[str] = None
@@ -38,7 +56,6 @@ async def retrieve(self) -> "ProcessBase":
            Error,
        )

        text = ""
        try:
            await self.crawler.domain_semaphore.acquire(self.crawler_url.url.domain)
            pass
@@ -53,31 +70,9 @@ async def retrieve(self) -> "ProcessBase":
                self.response = response
                processor = get_processor(self)
                if processor and processor.requires_content:
                    encoding = response.get_encoding()
                    self.content = (
                        await response.content.read(MAX_RESPONSE_SIZE)
                    ).decode(encoding, errors="ignore")
                    self.content = await get_content(response)
                if processor.has_descendants:
                    processor = get_processor(self)
            # text = ""
            # soup = None
            # processor = None
            # if response.status_code < 300 and self.must_be_downloaded(response):
            #     try:
            #         text = response.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
            #     except (RequestException, ReadTimeoutError, socket.timeout) as e:
            #         self.crawler.current_processed_count += 1
            #         self.crawler.results.put(Error(self, e))
            #         self.close()
            #         return self
            #     content_type = cgi.parse_header(
            #         response.headers.get("Content-Type", "")
            #     )[0]
            #     soup = (
            #         BeautifulSoup(text, "html.parser")
            #         if content_type == "text/html"
            #         else None
            #     )
        except RequestException as e:
            self.crawler.current_processed_count += 1
            processor = Error(self, e)
@@ -104,16 +99,19 @@ def __init__(
        self,
        crawler: "Crawler",
        target_url: str,
        depth=3,
        source=None,
        exists=None,
        url_type=None,
        depth: int = 3,
        source: Optional["CrawlerUrl"] = None,
        exists: bool = None,
        url_type: Optional[URL_TYPES] = None,
    ):
        """
        :type crawler: Crawler instance
        :type target_url: Uniform Resource Identifier as string
        :type depth: int maximum depth to crawl respect to the initial url
        :type source: CrawlerUrl instance. Optional.
        :type exists: bool. If exists is True the path surely exists. Optional.
        :type url_type: str. Optional.
        """
        self.target_url = target_url
        url = Url(target_url)
@@ -152,7 +150,11 @@ async def retrieve(self):

        crawler_url_request = CrawlerUrlRequest(self)
        processor = await crawler_url_request.retrieve()
        if processor is not None and not isinstance(processor, GenericProcessor):
        if (
            processor is not None
            and not isinstance(processor, GenericProcessor)
            and self.url_type not in {"asset", "index_file"}
        ):
            self.crawler.console.print(processor.get_text())
        # if self.must_be_downloaded(response):
        #     processor = get_processor(response, text, self, soup) or GenericProcessor(
@@ -174,11 +176,10 @@ async def retrieve(self):
            and crawler_url_request.response.status < 404
        ):
            self.exists = True
        # TODO: uncomment
        # await self.add_self_directories(
        #     True if (not self.maybe_rewrite() and self.exists) else None,
        #     "directory" if not self.maybe_rewrite() else None,
        # )
        await self.add_self_directories(
            True if (not self.maybe_rewrite() and self.exists) else None,
            "directory" if not self.maybe_rewrite() else None,
        )

    def set_type(self, content_type):
        from dirhunt.processors import INDEX_FILES
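
The new get_content helper above streams the body in chunks, stops once MAX_RESPONSE_SIZE is buffered, and falls back to charset_normalizer when aiohttp cannot report an encoding. A self-contained sketch of the same pattern outside dirhunt, assuming a reachable example URL; it mirrors the committed helper rather than importing it:

import asyncio

import aiohttp
import charset_normalizer as chardet

RESPONSE_CHUNK = 1024 * 4
MAX_RESPONSE_SIZE = 1024 * 512
DEFAULT_ENCODING = "utf-8"

async def get_content(response: aiohttp.ClientResponse) -> str:
    try:
        encoding = response.get_encoding()
    except RuntimeError:
        # aiohttp raises RuntimeError when the body hasn't been read yet
        encoding = None
    data = b""
    async for chunk in response.content.iter_chunked(RESPONSE_CHUNK):
        data += chunk
        if not chunk or len(data) >= MAX_RESPONSE_SIZE:
            break  # stop early instead of buffering huge responses
    encoding = encoding or chardet.detect(data)["encoding"]
    return data.decode(encoding or DEFAULT_ENCODING, errors="ignore")

async def main() -> None:
    async with aiohttp.ClientSession() as session:
        async with session.get("https://example.com/") as response:
            print((await get_content(response))[:200])

asyncio.run(main())
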
