Issue #110: [feature request] add keyword search
Nekmo committed Aug 8, 2023
1 parent 887f9fa commit 423a899
Showing 3 changed files with 27 additions and 7 deletions.
10 changes: 6 additions & 4 deletions dirhunt/crawler.py
@@ -31,8 +31,8 @@
 class Crawler(ThreadPoolExecutor):
     urls_info = None

-    def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, std=None,
-                 progress_enabled=True, timeout=10, depth=3, not_follow_subdomains=False, exclude_sources=(),
+    def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, interesting_keywords=None,
+                 std=None, progress_enabled=True, timeout=10, depth=3, not_follow_subdomains=False, exclude_sources=(),
                  not_allow_redirects=False, proxies=None, delay=0, limit=1000, to_file=None, user_agent=None,
                  cookies=None, headers=None):
         if not max_workers and not delay:
@@ -53,6 +53,7 @@ def __init__(self, max_workers=None, interesting_extensions=None, interesting_fi
         self.start_dt = datetime.datetime.now()
         self.interesting_extensions = interesting_extensions or []
         self.interesting_files = interesting_files or []
+        self.interesting_keywords = interesting_keywords or []
         self.closing = False
         self.std = std or None
         self.progress_enabled = progress_enabled
@@ -206,8 +207,9 @@ def options(self):
         return {
             'interesting_extensions': self.interesting_extensions,
             'interesting_files': self.interesting_files,
-            'timeout': self.interesting_files,
-            'depth': self.interesting_files,
+            'interesting_keywords': self.interesting_keywords,
+            'timeout': self.timeout,
+            'depth': self.depth,
             'not_follow_subdomains': self.not_follow_subdomains,
             'exclude_sources': self.exclude_sources,
             'not_allow_redirects': self.not_allow_redirects,
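A side note on this last hunk: besides exposing the new interesting_keywords option, it fixes a pre-existing bug in options(), where the 'timeout' and 'depth' entries both echoed self.interesting_files instead of self.timeout and self.depth. Below is a minimal sketch of passing the new argument directly; the keyword values are illustrative only, and it assumes dirhunt is importable and that the remaining constructor defaults are usable as-is:

    import sys
    from dirhunt.crawler import Crawler

    # interesting_keywords is stored verbatim on the crawler and echoed back by options()
    crawler = Crawler(interesting_keywords=['password', 'secret', 'api_key'], std=sys.stdout)
    print(crawler.options()['interesting_keywords'])  # ['password', 'secret', 'api_key']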
9 changes: 6 additions & 3 deletions dirhunt/management.py
@@ -116,6 +116,8 @@ def flags_range(flags):
               help='The files found with these extensions are interesting')
 @click.option('-f', '--interesting-files', callback=comma_separated_files, default=','.join(INTERESTING_FILES),
               help='The files with these names are interesting')
+@click.option('-k', '--interesting-keywords', callback=comma_separated_files, default='',
+              help='The files with these keywords in their content are interesting')
 @click.option('--stdout-flags', callback=comma_separated_files, default=','.join(STDOUT_FLAGS),
               help='Return only in stdout the urls of these flags')
 @click.option('--progress-enabled/--progress-disabled', default=None)
@@ -140,8 +142,8 @@ def flags_range(flags):
               help='Add a header to requests in the header:value format.')
 @click.option('--version', is_flag=True, callback=print_version,
               expose_value=False, is_eager=True)
-def hunt(urls, threads, exclude_flags, include_flags, interesting_extensions, interesting_files, stdout_flags,
-         progress_enabled, timeout, max_depth, not_follow_subdomains, exclude_sources, proxies, delay,
+def hunt(urls, threads, exclude_flags, include_flags, interesting_extensions, interesting_files, interesting_keywords,
+         stdout_flags, progress_enabled, timeout, max_depth, not_follow_subdomains, exclude_sources, proxies, delay,
          not_allow_redirects, limit, to_file, user_agent, cookies, headers):
     """Find web directories without bruteforce
     """
@@ -157,7 +159,8 @@ def hunt(urls, threads, exclude_flags, include_flags, interesting_extensions, in
     exclude_flags, include_flags = flags_range(exclude_flags), flags_range(include_flags)
     progress_enabled = (sys.stdout.isatty() or sys.stderr.isatty()) if progress_enabled is None else progress_enabled
     crawler = Crawler(max_workers=threads, interesting_extensions=interesting_extensions,
-                      interesting_files=interesting_files, std=sys.stdout if sys.stdout.isatty() else sys.stderr,
+                      interesting_files=interesting_files, interesting_keywords=interesting_keywords,
+                      std=sys.stdout if sys.stdout.isatty() else sys.stderr,
                       progress_enabled=progress_enabled, timeout=timeout, depth=max_depth,
                       not_follow_subdomains=not_follow_subdomains, exclude_sources=exclude_sources,
                       not_allow_redirects=not_allow_redirects, proxies=proxies, delay=delay, limit=limit,
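The new flag reuses the same comma_separated_files callback as -e and -f, so a comma-separated value on the command line should reach hunt() as a list of keywords. An illustrative invocation (the target URL and keyword values are examples only):

    dirhunt https://example.com -k password,secret,api_key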
15 changes: 15 additions & 0 deletions dirhunt/processors.py
@@ -78,6 +78,7 @@ def __init__(self, response, crawler_url):
         self.status_code = response.status_code
         # TODO: procesar otras cosas (css, etc.)
         self.crawler_url = crawler_url
+        self.keywords_found = set()

     def search_index_files(self):
         if self.crawler_url.type not in ['directory', None]:
@@ -96,6 +97,13 @@ def search_index_files(self):
                 self.index_file = url
                 break

+    def search_keywords(self, text):
+        if sys.version_info > (3,) and isinstance(text, bytes):
+            text = text.decode('utf-8')
+        for keyword in self.crawler_url.crawler.interesting_keywords:
+            if keyword in text:
+                self.keywords_found.add(keyword)
+
     @classmethod
     def is_applicable(cls, request, text, crawler_url, soup):
         raise NotImplementedError
@@ -127,6 +135,9 @@ def __str__(self):
         if self.index_file:
             body += colored('\n Index file found: ', Fore.BLUE)
             body += '{}'.format(self.index_file.name)
+        if self.keywords_found:
+            body += colored('\n Keywords found: ', Fore.BLUE)
+            body += ', '.join(self.keywords_found)
         return body

     def json(self):
@@ -242,6 +253,7 @@ class ProcessCssStyleSheet(ProcessBase):
     def process(self, text, soup=None):
         if sys.version_info > (3,) and isinstance(text, bytes):
             text = text.decode('utf-8')
+        self.search_keywords(text)
         urls = [full_url_address(url, self.crawler_url.url) for url in re.findall(': *url\(["\']?(.+?)["\']?\)', text)]
         for url in urls:
             self.add_url(url, depth=0, type='asset')
@@ -259,6 +271,7 @@ class ProcessJavaScript(ProcessBase):
     def process(self, text, soup=None):
         if sys.version_info > (3,) and isinstance(text, bytes):
             text = text.decode('utf-8')
+        self.search_keywords(text)
         urls = [full_url_address(url[0], self.crawler_url.url)
                 for url in re.findall(TEXT_PLAIN_PATH_STRING_REGEX, text, re.VERBOSE)]
         for url in urls:
@@ -276,6 +289,7 @@ class ProcessHtmlRequest(ProcessBase):
     key_name = 'html'

     def process(self, text, soup=None):
+        self.search_keywords(text)
         self.assets(soup)
         self.links(soup)
         self.search_index_files()
@@ -334,6 +348,7 @@ class ProcessIndexOfRequest(ProcessHtmlRequest):
     index_titles = ('index of', 'directory listing for')

     def process(self, text, soup=None):
+        self.search_keywords(text)
         directory_list = get_directory_list(text, self, soup)
         links = [link for link in directory_list.get_links(text, soup) if link.is_valid()]
         for link in filter(lambda x: x.is_valid() and x.url.endswith('/'), links):
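The matching itself is a plain, case-sensitive substring test over the decoded response body, repeated at the start of each processor's process() method (CSS, JavaScript, HTML and directory-listing responses); matched keywords accumulate in keywords_found and are appended to the textual report alongside any index file. A standalone restatement of that logic, for illustration only (the free function and sample input below are not part of dirhunt):

    def search_keywords(text, interesting_keywords, keywords_found):
        # Mirrors ProcessBase.search_keywords: decode bytes, then run a
        # case-sensitive substring check for every configured keyword.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        for keyword in interesting_keywords:
            if keyword in text:
                keywords_found.add(keyword)

    found = set()
    search_keywords(b'<input type="password" name="passwd">', ['password', 'secret'], found)
    print(found)  # {'password'}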
