From b5a2d140b2c5af5abf381c46f0680db9bb6e2c7b Mon Sep 17 00:00:00 2001 From: Nekmo Date: Wed, 9 Aug 2023 18:43:59 +0200 Subject: [PATCH] Issue #83: Use Asyncio --- dirhunt/colors.py | 4 + dirhunt/crawler.py | 16 ++- dirhunt/crawler_url.py | 75 +++++++------- dirhunt/processors.py | 222 ++++++++++++++++++++++++++--------------- 4 files changed, 197 insertions(+), 120 deletions(-) diff --git a/dirhunt/colors.py b/dirhunt/colors.py index 983cbba..1ea575a 100644 --- a/dirhunt/colors.py +++ b/dirhunt/colors.py @@ -10,6 +10,10 @@ def status_code_colors(status_code): return "green3" elif 300 <= status_code < 400: return "deep_sky_blue1" + elif 400 <= status_code < 404 or 404 < status_code < 500: + return "deep_pink2" + elif 404 == status_code: + return "bright_red" elif 500 == status_code: return "magenta1" else: diff --git a/dirhunt/crawler.py b/dirhunt/crawler.py index cf4aa73..13a3092 100644 --- a/dirhunt/crawler.py +++ b/dirhunt/crawler.py @@ -29,6 +29,7 @@ from dirhunt.json_report import JsonReportEncoder from dirhunt.sessions import Sessions, Session from dirhunt.sources import Sources +from dirhunt.url import Url from dirhunt.url_info import UrlsInfo """Flags importance""" @@ -82,19 +83,24 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop async def start(self): """Add urls to process.""" for url in self.configuration.urls: - await self.add_crawler_url( - CrawlerUrl(self, url, depth=self.configuration.max_depth) - ) + crawler_url = CrawlerUrl(self, url, depth=self.configuration.max_depth) + self.domains.add(crawler_url.url.domain) + await self.add_crawler_url(crawler_url) + while self.tasks: await asyncio.wait(self.tasks) + await self.session.close() async def add_crawler_url(self, crawler_url: CrawlerUrl) -> Optional[asyncio.Task]: """Add crawler_url to tasks""" - if crawler_url.url.url in self.crawler_urls: + if ( + crawler_url in self.crawler_urls + or crawler_url.url.domain not in self.domains + ): return + self.crawler_urls.add(crawler_url) task = self.loop.create_task(crawler_url.retrieve()) self.tasks.add(task) - self.crawler_urls.add(crawler_url) task.add_done_callback(self.tasks.discard) return task diff --git a/dirhunt/crawler_url.py b/dirhunt/crawler_url.py index 8a2b9da..c548155 100644 --- a/dirhunt/crawler_url.py +++ b/dirhunt/crawler_url.py @@ -1,21 +1,24 @@ # -*- coding: utf-8 -*- import cgi -import socket -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Optional, Literal +from aiohttp import ClientResponse from aiohttp.web_response import Response from bs4 import BeautifulSoup from requests import RequestException -from urllib3.exceptions import ReadTimeoutError +import charset_normalizer as chardet from dirhunt.url import Url +RESPONSE_CHUNK = 1024 * 4 MAX_RESPONSE_SIZE = 1024 * 512 FLAGS_WEIGHT = { "blank": 4, "not_found.fake": 3, "html": 2, } +URL_TYPES = Literal["index_file",] # index.php, index.html, index.htm, etc. 
+DEFAULT_ENCODING = "utf-8" if TYPE_CHECKING: @@ -23,6 +26,21 @@ from dirhunt.processors import ProcessBase +async def get_content(response: "ClientResponse") -> str: + try: + encoding = response.get_encoding() + except RuntimeError: + # aiohttp can't detect encoding if the content is not available + encoding = None + data = b"" + async for chunk in response.content.iter_chunked(RESPONSE_CHUNK): + data += chunk + if not chunk or len(data) >= MAX_RESPONSE_SIZE: + break + encoding = encoding or chardet.detect(data)["encoding"] + return data.decode(encoding or DEFAULT_ENCODING, errors="ignore") + + class CrawlerUrlRequest: response = Optional[Response] content: Optional[str] = None @@ -38,7 +56,6 @@ async def retrieve(self) -> "ProcessBase": Error, ) - text = "" try: await self.crawler.domain_semaphore.acquire(self.crawler_url.url.domain) pass @@ -53,31 +70,9 @@ async def retrieve(self) -> "ProcessBase": self.response = response processor = get_processor(self) if processor and processor.requires_content: - encoding = response.get_encoding() - self.content = ( - await response.content.read(MAX_RESPONSE_SIZE) - ).decode(encoding, errors="ignore") + self.content = await get_content(response) if processor.has_descendants: processor = get_processor(self) - # text = "" - # soup = None - # processor = None - # if response.status_code < 300 and self.must_be_downloaded(response): - # try: - # text = response.raw.read(MAX_RESPONSE_SIZE, decode_content=True) - # except (RequestException, ReadTimeoutError, socket.timeout) as e: - # self.crawler.current_processed_count += 1 - # self.crawler.results.put(Error(self, e)) - # self.close() - # return self - # content_type = cgi.parse_header( - # response.headers.get("Content-Type", "") - # )[0] - # soup = ( - # BeautifulSoup(text, "html.parser") - # if content_type == "text/html" - # else None - # ) except RequestException as e: self.crawler.current_processed_count += 1 processor = Error(self, e) @@ -104,16 +99,19 @@ def __init__( self, crawler: "Crawler", target_url: str, - depth=3, - source=None, - exists=None, - url_type=None, + depth: int = 3, + source: Optional["CrawlerUrl"] = None, + exists: bool = None, + url_type: Optional[URL_TYPES] = None, ): """ :type crawler: Crawler instance :type target_url: Uniform Resource Identifier as string :type depth: int maximum depth to crawl respect to the initial url + :type source: CrawlerUrl instance. Optional. + :type exists: bool. If exists is True the path surely exists. Optional. + :type url_type: str. Optional. 
""" self.target_url = target_url url = Url(target_url) @@ -152,7 +150,11 @@ async def retrieve(self): crawler_url_request = CrawlerUrlRequest(self) processor = await crawler_url_request.retrieve() - if processor is not None and not isinstance(processor, GenericProcessor): + if ( + processor is not None + and not isinstance(processor, GenericProcessor) + and self.url_type not in {"asset", "index_file"} + ): self.crawler.console.print(processor.get_text()) # if self.must_be_downloaded(response): # processor = get_processor(response, text, self, soup) or GenericProcessor( @@ -174,11 +176,10 @@ async def retrieve(self): and crawler_url_request.response.status < 404 ): self.exists = True - # TODO: uncomment - # await self.add_self_directories( - # True if (not self.maybe_rewrite() and self.exists) else None, - # "directory" if not self.maybe_rewrite() else None, - # ) + await self.add_self_directories( + True if (not self.maybe_rewrite() and self.exists) else None, + "directory" if not self.maybe_rewrite() else None, + ) def set_type(self, content_type): from dirhunt.processors import INDEX_FILES diff --git a/dirhunt/processors.py b/dirhunt/processors.py index 5a18b49..96c90e0 100644 --- a/dirhunt/processors.py +++ b/dirhunt/processors.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- import re import sys -from typing import List, Type +from itertools import chain +from typing import List, Type, Iterator, Optional from aiohttp.web_response import Response from rich.text import Text @@ -21,7 +22,7 @@ from dirhunt.url_loop import is_url_loop from dirhunt.utils import colored -INDEX_FILES = ["index.php", "index.html", "index.html"] +INDEX_FILES = ["index.php", "index.html", "index.htm"] # Regex for JS. Source: https://github.com/GerbenJavado/LinkFinder/blob/master/linkfinder.py TEXT_PLAIN_PATH_STRING_REGEX = r""" @@ -75,7 +76,7 @@ class ProcessBase: # to get the correct processor has_descendants = False - def __init__(self, crawler_url_request): + def __init__(self, crawler_url_request: "CrawlerUrlRequest"): """ :type crawler_url_request: CrawlerUrlRequest """ @@ -86,6 +87,7 @@ def __init__(self, crawler_url_request): self.keywords_found = set() async def search_index_files(self): + """Search for index files in the directory. For example index.php, index.html, etc.""" if self.crawler_url.url_type not in ["directory", None]: return crawler = self.crawler_url.crawler @@ -96,9 +98,9 @@ async def search_index_files(self): crawler, url, self.crawler_url.depth - 1, - self, + self.crawler_url, None, - "document", + "index_file", ) await self.crawler_url.crawler.add_crawler_url(sub_crawler_url) if sub_crawler_url.exists and sub_crawler_url.processor.status_code == 200: @@ -125,6 +127,7 @@ def maybe_directory(self): return self.crawler_url.maybe_directory() def get_url_line_text(self): + """Return a Text object with the url info in a line.""" text = Text() text.append( "[{}]".format(self.status_code), status_code_colors(self.status_code) @@ -133,7 +136,12 @@ def get_url_line_text(self): text.append(" ({})".format(self.name or self.__class__.__name__), "gold1") return text - async def add_url(self, url: Url, depth: int = 3, **kwargs): + async def add_url(self, url: Url, depth: int = 3, **kwargs) -> None: + """Add a new url to the crawler. 
+ + :param url: Url to add + :param depth: Depth of the url + """ if is_url_loop(url): return await self.crawler_url.crawler.add_crawler_url( @@ -142,7 +150,8 @@ async def add_url(self, url: Url, depth: int = 3, **kwargs): ) ) - def get_text(self): + def get_text(self) -> Text: + """Return a Text object with the info of the processor.""" text = self.get_url_line_text() if self.index_file: text.append("\n Index file found: ", "blue1") @@ -152,7 +161,7 @@ def get_text(self): text.append(", ".join(self.keywords_found)) return text - def json(self): + def json(self) -> dict: return { "processor_class": "{}".format(self.__class__.__name__), "status_code": self.status_code, @@ -202,67 +211,90 @@ def maybe_directory(self): class GenericProcessor(ProcessBase): + """Generic processor. It's used when the processor is not found. + It's the last processor to be executed. It's always applicable. + """ + name = "Generic" key_name = "generic" - async def process(self, text, soup=None): + async def process(self, crawler_url_request: "CrawlerUrlRequest") -> None: + """Process the request. This method will search for index files + in the directory. + """ await self.search_index_files() @classmethod - def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest") -> bool: + """This processor is always applicable.""" return True class ProcessRedirect(ProcessBase): + """Processor for redirects. It's applicable when the response status code is 3xx.""" + name = "Redirect" key_name = "redirect" redirector = None def __init__(self, crawler_url_request: "CrawlerUrlRequest"): + """Initialize the processor.""" super(ProcessRedirect, self).__init__(crawler_url_request) self.redirector = full_url_address( crawler_url_request.response.headers.get("Location"), self.crawler_url.url ) - async def process(self, crawler_url_request: "CrawlerUrlRequest"): + async def process(self, crawler_url_request: "CrawlerUrlRequest") -> None: + """Process the request. This method will add the redirector url to the crawler.""" if not self.crawler_url.crawler.configuration.not_allow_redirects: await self.add_url(self.redirector) @classmethod - def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest") -> bool: + """This processor is applicable when the response status code is 3xx.""" return ( crawler_url_request.response is not None and 300 <= crawler_url_request.response.status < 400 + # if we are searching for index files, we don't want to follow redirects + and crawler_url_request.crawler_url.url_type != "index_file" ) - def __str__(self): - body = super(ProcessRedirect, self).__str__() - body += colored("\n Redirect to: ", Fore.BLUE) - body += "{}".format(self.redirector.address) - return body + def get_text(self) -> Text: + """Return a Text object with the info of the processor.""" + text = super(ProcessRedirect, self).get_text() + text.append("\n Redirect to: ", "blue1") + text.append("{}".format(self.redirector)) + return text class ProcessNotFound(ProcessBase): + """Processor for 404 errors. It's applicable when the response status code is 404.""" + name = "Not Found" key_name = "not_found" - async def process(self, text, soup=None): + async def process(self, crawler_url_request: "CrawlerUrlRequest") -> None: + """Process the request. 
This method will search for index files.""" await self.search_index_files() @classmethod - def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest") -> None: + """This processor is applicable when the response status code is 404.""" return ( crawler_url_request.response is not None and crawler_url_request.response.status == 404 ) - def __str__(self): - body = self.url_line() + def get_text(self) -> Text: + """Return a Text object with the info of the processor.""" + text = self.get_url_line_text() if self.crawler_url.exists: - body += colored(" (FAKE 404)", Fore.YELLOW) + text.append(" (FAKE 404)", "gold1") if self.index_file: - body += "\n Index file found: {}".format(self.index_file.name) - return body + text.append( + "\n Index file found: {}".format(self.index_file.name), "blue1" + ) + return text @property def flags(self): @@ -273,24 +305,27 @@ def flags(self): class ProcessCssStyleSheet(ProcessBase): + """Processor for CSS stylesheets. It's applicable when the response content type is text/css.""" + name = "CSS StyleSheet" key_name = "css" requires_content = True - def process(self, text, soup=None): - if sys.version_info > (3,) and isinstance(text, bytes): - text = text.decode("utf-8") - self.search_keywords(text) + async def process(self, crawler_url_request: "CrawlerUrlRequest") -> None: + """Process the request. This method will search for urls in the CSS stylesheet.""" + self.search_keywords(crawler_url_request.content) urls = [ full_url_address(url, self.crawler_url.url) - for url in re.findall(": *url\([\"']?(.+?)[\"']?\)", text) + for url in re.findall( + ": *url\([\"']?(.+?)[\"']?\)", crawler_url_request.content + ) ] for url in urls: - self.add_url(url, depth=0, type="asset") - return urls + await self.add_url(url, depth=0, url_type="asset") @classmethod - def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest") -> bool: + """This processor is applicable when the response content type is text/css.""" return ( crawler_url_request.response is not None and crawler_url_request.response.headers.get("Content-Type", "") @@ -301,24 +336,27 @@ def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): class ProcessJavaScript(ProcessBase): + """Processor for JavaScript files. It's applicable when the response content type is application/javascript.""" + name = "JavaScript" key_name = "js" requires_content = True - def process(self, text, soup=None): - if sys.version_info > (3,) and isinstance(text, bytes): - text = text.decode("utf-8") - self.search_keywords(text) + async def process(self, crawler_url_request: "CrawlerUrlRequest") -> None: + """Process the request. 
This method will search for urls in the JavaScript file.""" + self.search_keywords(crawler_url_request.content) urls = [ full_url_address(url[0], self.crawler_url.url) - for url in re.findall(TEXT_PLAIN_PATH_STRING_REGEX, text, re.VERBOSE) + for url in re.findall( + TEXT_PLAIN_PATH_STRING_REGEX, crawler_url_request.content, re.VERBOSE + ) ] for url in urls: - self.add_url(url, depth=0, type="asset") - return urls + await self.add_url(url, depth=0, url_type="asset") @classmethod - def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest") -> bool: + """This processor is applicable when the response content type is application/javascript.""" return ( crawler_url_request.response is not None and crawler_url_request.response.headers.get("Content-Type", "") @@ -329,18 +367,22 @@ def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): class ProcessHtmlRequest(ProcessBase): + """Processor for HTML documents. It's applicable when the response content type is text/html.""" + name = "HTML document" key_name = "html" requires_content = True has_descendants = True - async def process(self, crawler_url_request: "CrawlerUrlRequest"): + async def process(self, crawler_url_request: "CrawlerUrlRequest") -> None: + """Process the request. This method will search for urls in the HTML document.""" self.search_keywords(crawler_url_request.content) - self.assets(crawler_url_request.soup) - self.links(crawler_url_request.soup) + await self.assets(crawler_url_request.soup) + await self.links(crawler_url_request.soup) await self.search_index_files() - def links(self, soup): + async def links(self, soup) -> None: + """Search for links in the HTML document and add them to Crawler.""" links = [ full_url_address(link.attrs.get("href"), self.crawler_url.url) for link in soup.find_all("a") @@ -369,9 +411,10 @@ def links(self, soup): depth -= 1 if depth <= 0: continue - self.add_url(link, depth) + await self.add_url(link, depth) - def assets(self, soup): + async def assets(self, soup) -> None: + """Search for assets in the HTML document and add them to Crawler.""" assets = [ full_url_address(link.attrs.get("href"), self.crawler_url.url) for link in soup.find_all("link") @@ -386,9 +429,9 @@ def assets(self, soup): ] for asset in filter(bool, assets): self.analyze_asset(asset) - self.add_url(asset, type="asset") + await self.add_url(asset, url_type="asset") - def analyze_asset(self, asset): + def analyze_asset(self, asset) -> None: """ :type asset: Url @@ -402,7 +445,7 @@ def analyze_asset(self, asset): self.crawler_url.depth -= 1 @classmethod - def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest") -> bool: return ( crawler_url_request.response is not None and crawler_url_request.response.headers.get("Content-Type", "") @@ -413,6 +456,10 @@ def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): class ProcessIndexOfRequest(ProcessHtmlRequest): + """Processor for Index Of pages. It's applicable when the response content + type is text/html and the page contains a list of links to files and directories. 
+ """ + name = "Index Of" key_name = "index_of" files = None @@ -420,59 +467,74 @@ class ProcessIndexOfRequest(ProcessHtmlRequest): requires_content = True has_descendants = False - def process(self, text, soup=None): - self.search_keywords(text) - directory_list = get_directory_list(text, self, soup) + async def process(self, crawler_url_request: "CrawlerUrlRequest") -> None: + """Process the request. This method will search for urls in the Index Of page.""" + self.search_keywords(crawler_url_request.content) + directory_list = get_directory_list( + crawler_url_request.content, self, crawler_url_request.soup + ) links = [ - link for link in directory_list.get_links(text, soup) if link.is_valid() + link + for link in directory_list.get_links( + crawler_url_request.content, crawler_url_request.soup + ) + if link.is_valid() ] for link in filter(lambda x: x.is_valid() and x.url.endswith("/"), links): - self.add_url(link, type="directory") + await self.add_url(link, url_type="directory") self.files = links - def interesting_ext_files(self): + def interesting_ext_files(self) -> Iterator[Url]: + """Return a list of files with interesting extensions.""" return filter( lambda x: x.name.split(".")[-1] - in self.crawler_url.crawler.interesting_extensions, + in self.crawler_url.crawler.configuration.interesting_extensions, self.files, ) - def interesting_name_files(self): + def interesting_name_files(self) -> Iterator[Url]: + """Return a list of files with interesting names.""" return filter( - lambda x: x.name in self.crawler_url.crawler.interesting_files, self.files + lambda x: x.name + in self.crawler_url.crawler.configuration.interesting_files, + self.files, ) - def interesting_files(self): - for iterator in [self.interesting_ext_files(), self.interesting_name_files()]: - for file in iterator: - yield file + def interesting_files(self) -> Iterator[Url]: + """Return a list of files with interesting extensions or names.""" + return chain(self.interesting_ext_files(), self.interesting_name_files()) - def __str__(self): - body = super(ProcessIndexOfRequest, self).__str__() + def get_text(self) -> Text: + """Return the text to be displayed in the console.""" + text = super(ProcessIndexOfRequest, self).get_text() ext_files = list(self.interesting_ext_files()) name_files = list(self.interesting_name_files()) if ext_files: - body += colored("\n Interesting extension files:", Fore.BLUE) - body += " {}".format(", ".join(map(lambda x: self.repr_file(x), ext_files))) + text.append("\n Interesting extension files:", "blue1") + text.append( + " {}".format(", ".join(map(lambda x: self.repr_file(x), ext_files))) + ) if name_files: - body += colored("\n Interesting file names:", Fore.MAGENTA) - body += " {}".format( - ", ".join(map(lambda x: self.repr_file(x), name_files)) + text.append("\n Interesting file names:", "deep_sky_blue1") + text.append( + " {}".format(", ".join(map(lambda x: self.repr_file(x), name_files))) ) if not ext_files and not name_files: - body += colored(" (Nothing interesting)", Fore.LIGHTYELLOW_EX) - return body + text.append(" (Nothing interesting)", "gold1") + return text @classmethod - def repr_file(cls, file): - text = file.name + def repr_file(cls, file) -> str: + """Return a string representation of a file.""" + body = file.name created_at, filesize = file.extra.get("created_at"), file.extra.get("filesize") if created_at or filesize: - text += " ({})".format(" ⚫ ".join(filter(bool, [created_at, filesize]))) - return text + body += " ({})".format(" ⚫ ".join(filter(bool, [created_at, 
filesize]))) + return body @classmethod - def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest") -> bool: + """Return True if the request is applicable to this processor.""" if ( not super(ProcessIndexOfRequest, cls).is_applicable(crawler_url_request) or crawler_url_request.content is None @@ -498,13 +560,16 @@ def flags(self): class ProcessBlankPageRequest(ProcessHtmlRequest): + """Processor for blank pages. It's applicable when the response content is empty.""" + name = "Blank page" key_name = "blank" requires_content = True has_descendants = False @classmethod - def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest") -> bool: + """Return True if the request is applicable to this processor.""" if ( not super(ProcessBlankPageRequest, cls).is_applicable(crawler_url_request) or crawler_url_request.content is None @@ -533,7 +598,8 @@ def tag_visible(element): return True -def get_processor(crawler_url_request: "CrawlerUrlRequest"): +def get_processor(crawler_url_request: "CrawlerUrlRequest") -> Optional["ProcessBase"]: + """Return the processor for the given request.""" for processor_class in PROCESSORS: if processor_class.is_applicable(crawler_url_request): return processor_class(crawler_url_request)
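
The task bookkeeping introduced in Crawler.start() and add_crawler_url() follows a standard asyncio pattern: deduplicate before scheduling, keep a strong reference to each pending task in a set, drop the reference from a done callback, and keep awaiting the set until it drains, because a finished task may have scheduled new ones. The sketch below is a minimal, self-contained illustration of that pattern under those assumptions; MiniCrawler, add_url and the example URLs are invented for the illustration and are not names from the patch.

    import asyncio
    from typing import Set


    class MiniCrawler:
        """Toy model of the task bookkeeping used by Crawler.start()/add_crawler_url()."""

        def __init__(self) -> None:
            self.tasks: Set[asyncio.Task] = set()
            self.seen: Set[str] = set()

        def add_url(self, url: str) -> None:
            if url in self.seen:  # deduplicate before scheduling
                return
            self.seen.add(url)
            task = asyncio.create_task(self.retrieve(url))
            self.tasks.add(task)                        # strong reference while the task runs
            task.add_done_callback(self.tasks.discard)  # forget it as soon as it finishes

        async def retrieve(self, url: str) -> None:
            await asyncio.sleep(0)  # stand-in for the real HTTP request and processing
            if url.endswith("example.com/"):
                self.add_url(url + "admin/")  # a retrieved page may schedule new urls

        async def start(self) -> None:
            self.add_url("https://example.com/")
            # Finished tasks may have added new ones, so keep draining until the set is empty.
            while self.tasks:
                await asyncio.wait(self.tasks)


    asyncio.run(MiniCrawler().start())

The done callback keeps the set from growing without bound, while the set itself prevents pending tasks from being garbage collected before they run, which is the same reason the patch stores tasks in self.tasks and discards them on completion.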
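
The new get_content() helper streams the response body in RESPONSE_CHUNK pieces, stops once MAX_RESPONSE_SIZE bytes have been collected, and falls back to charset_normalizer when aiohttp cannot determine the encoding before the body has been read. Below is a minimal standalone sketch of the same approach, assuming aiohttp and charset-normalizer are installed; the helper name read_capped, main() and the example URL are illustrative and do not come from the patch.

    import asyncio

    import aiohttp
    import charset_normalizer as chardet

    RESPONSE_CHUNK = 1024 * 4       # read in 4 KiB chunks
    MAX_RESPONSE_SIZE = 1024 * 512  # stop after 512 KiB

    async def read_capped(response: aiohttp.ClientResponse) -> str:
        """Read at most MAX_RESPONSE_SIZE bytes and decode with a best-effort charset."""
        try:
            encoding = response.get_encoding()
        except RuntimeError:
            # aiohttp cannot guess the charset until some body bytes are available
            encoding = None
        data = b""
        async for chunk in response.content.iter_chunked(RESPONSE_CHUNK):
            data += chunk
            if not chunk or len(data) >= MAX_RESPONSE_SIZE:
                break
        encoding = encoding or chardet.detect(data)["encoding"]
        return data.decode(encoding or "utf-8", errors="ignore")

    async def main() -> None:
        async with aiohttp.ClientSession() as session:
            async with session.get("https://example.com/") as response:
                body = await read_capped(response)
                print(len(body), "characters")

    asyncio.run(main())

Capping the read keeps a single oversized response from stalling the crawl, and decoding with errors="ignore" mirrors the patch's choice to prefer a partial, lossy body over failing the whole request on a bad charset.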