Issue #110: [feature request] add keyword search
Nekmo committed Aug 8, 2023
1 parent 887f9fa commit 423a899
Showing 3 changed files with 27 additions and 7 deletions.
10 changes: 6 additions & 4 deletions dirhunt/crawler.py
@@ -31,8 +31,8 @@
 class Crawler(ThreadPoolExecutor):
     urls_info = None

-    def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, std=None,
-                 progress_enabled=True, timeout=10, depth=3, not_follow_subdomains=False, exclude_sources=(),
+    def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, interesting_keywords=None,
+                 std=None, progress_enabled=True, timeout=10, depth=3, not_follow_subdomains=False, exclude_sources=(),
                  not_allow_redirects=False, proxies=None, delay=0, limit=1000, to_file=None, user_agent=None,
                  cookies=None, headers=None):
         if not max_workers and not delay:
@@ -53,6 +53,7 @@ def __init__(self, max_workers=None, interesting_extensions=None, interesting_fi
         self.start_dt = datetime.datetime.now()
         self.interesting_extensions = interesting_extensions or []
         self.interesting_files = interesting_files or []
+        self.interesting_keywords = interesting_keywords or []
         self.closing = False
         self.std = std or None
         self.progress_enabled = progress_enabled
@@ -206,8 +207,9 @@ def options(self):
         return {
             'interesting_extensions': self.interesting_extensions,
             'interesting_files': self.interesting_files,
-            'timeout': self.interesting_files,
-            'depth': self.interesting_files,
+            'interesting_keywords': self.interesting_keywords,
+            'timeout': self.timeout,
+            'depth': self.depth,
             'not_follow_subdomains': self.not_follow_subdomains,
             'exclude_sources': self.exclude_sources,
             'not_allow_redirects': self.not_allow_redirects,
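A side note on this last hunk: besides exposing the new interesting_keywords option, it fixes a pre-existing bug in options(), where the 'timeout' and 'depth' entries both echoed self.interesting_files instead of self.timeout and self.depth. Below is a minimal sketch of passing the new argument directly; the keyword values are illustrative only, and it assumes dirhunt is importable and that the remaining constructor defaults are usable as-is:

    import sys
    from dirhunt.crawler import Crawler

    # interesting_keywords is stored verbatim on the crawler and echoed back by options()
    crawler = Crawler(interesting_keywords=['password', 'secret', 'api_key'], std=sys.stdout)
    print(crawler.options()['interesting_keywords'])  # ['password', 'secret', 'api_key']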
9 changes: 6 additions & 3 deletions dirhunt/management.py
@@ -116,6 +116,8 @@ def flags_range(flags):
               help='The files found with these extensions are interesting')
 @click.option('-f', '--interesting-files', callback=comma_separated_files, default=','.join(INTERESTING_FILES),
               help='The files with these names are interesting')
+@click.option('-k', '--interesting-keywords', callback=comma_separated_files, default='',
+              help='The files with these keywords in their content are interesting')
 @click.option('--stdout-flags', callback=comma_separated_files, default=','.join(STDOUT_FLAGS),
               help='Return only in stdout the urls of these flags')
 @click.option('--progress-enabled/--progress-disabled', default=None)
@@ -140,8 +142,8 @@ def flags_range(flags):
               help='Add a header to requests in the header:value format.')
 @click.option('--version', is_flag=True, callback=print_version,
               expose_value=False, is_eager=True)
-def hunt(urls, threads, exclude_flags, include_flags, interesting_extensions, interesting_files, stdout_flags,
-         progress_enabled, timeout, max_depth, not_follow_subdomains, exclude_sources, proxies, delay,
+def hunt(urls, threads, exclude_flags, include_flags, interesting_extensions, interesting_files, interesting_keywords,
+         stdout_flags, progress_enabled, timeout, max_depth, not_follow_subdomains, exclude_sources, proxies, delay,
          not_allow_redirects, limit, to_file, user_agent, cookies, headers):
     """Find web directories without bruteforce
     """
@@ -157,7 +159,8 @@ def hunt(urls, threads, exclude_flags, include_flags, interesting_extensions, in
     exclude_flags, include_flags = flags_range(exclude_flags), flags_range(include_flags)
     progress_enabled = (sys.stdout.isatty() or sys.stderr.isatty()) if progress_enabled is None else progress_enabled
     crawler = Crawler(max_workers=threads, interesting_extensions=interesting_extensions,
-                      interesting_files=interesting_files, std=sys.stdout if sys.stdout.isatty() else sys.stderr,
+                      interesting_files=interesting_files, interesting_keywords=interesting_keywords,
+                      std=sys.stdout if sys.stdout.isatty() else sys.stderr,
                       progress_enabled=progress_enabled, timeout=timeout, depth=max_depth,
                       not_follow_subdomains=not_follow_subdomains, exclude_sources=exclude_sources,
                       not_allow_redirects=not_allow_redirects, proxies=proxies, delay=delay, limit=limit,
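The new flag reuses the same comma_separated_files callback as -e and -f, so a comma-separated value on the command line should reach hunt() as a list of keywords. An illustrative invocation (the target URL and keyword values are examples only):

    dirhunt https://example.com -k password,secret,api_key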
15 changes: 15 additions & 0 deletions dirhunt/processors.py
@@ -78,6 +78,7 @@ def __init__(self, response, crawler_url):
         self.status_code = response.status_code
         # TODO: procesar otras cosas (css, etc.)
         self.crawler_url = crawler_url
+        self.keywords_found = set()

     def search_index_files(self):
         if self.crawler_url.type not in ['directory', None]:
@@ -96,6 +97,13 @@ def search_index_files(self):
                 self.index_file = url
                 break

+    def search_keywords(self, text):
+        if sys.version_info > (3,) and isinstance(text, bytes):
+            text = text.decode('utf-8')
+        for keyword in self.crawler_url.crawler.interesting_keywords:
+            if keyword in text:
+                self.keywords_found.add(keyword)
+
     @classmethod
     def is_applicable(cls, request, text, crawler_url, soup):
         raise NotImplementedError
@@ -127,6 +135,9 @@ def __str__(self):
         if self.index_file:
             body += colored('\n Index file found: ', Fore.BLUE)
             body += '{}'.format(self.index_file.name)
+        if self.keywords_found:
+            body += colored('\n Keywords found: ', Fore.BLUE)
+            body += ', '.join(self.keywords_found)
         return body

     def json(self):
@@ -242,6 +253,7 @@ class ProcessCssStyleSheet(ProcessBase):
     def process(self, text, soup=None):
         if sys.version_info > (3,) and isinstance(text, bytes):
             text = text.decode('utf-8')
+        self.search_keywords(text)
         urls = [full_url_address(url, self.crawler_url.url) for url in re.findall(': *url\(["\']?(.+?)["\']?\)', text)]
         for url in urls:
             self.add_url(url, depth=0, type='asset')
@@ -259,6 +271,7 @@ class ProcessJavaScript(ProcessBase):
     def process(self, text, soup=None):
         if sys.version_info > (3,) and isinstance(text, bytes):
             text = text.decode('utf-8')
+        self.search_keywords(text)
         urls = [full_url_address(url[0], self.crawler_url.url)
                 for url in re.findall(TEXT_PLAIN_PATH_STRING_REGEX, text, re.VERBOSE)]
         for url in urls:
@@ -276,6 +289,7 @@ class ProcessHtmlRequest(ProcessBase):
     key_name = 'html'

     def process(self, text, soup=None):
+        self.search_keywords(text)
         self.assets(soup)
         self.links(soup)
         self.search_index_files()
@@ -334,6 +348,7 @@ class ProcessIndexOfRequest(ProcessHtmlRequest):
     index_titles = ('index of', 'directory listing for')

     def process(self, text, soup=None):
+        self.search_keywords(text)
         directory_list = get_directory_list(text, self, soup)
         links = [link for link in directory_list.get_links(text, soup) if link.is_valid()]
         for link in filter(lambda x: x.is_valid() and x.url.endswith('/'), links):
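The matching itself is a plain, case-sensitive substring test over the decoded response body, repeated at the start of each processor's process() method (CSS, JavaScript, HTML and directory-listing responses); matched keywords accumulate in keywords_found and are appended to the textual report alongside any index file. A standalone restatement of that logic, for illustration only (the free function and sample input below are not part of dirhunt):

    def search_keywords(text, interesting_keywords, keywords_found):
        # Mirrors ProcessBase.search_keywords: decode bytes, then run a
        # case-sensitive substring check for every configured keyword.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        for keyword in interesting_keywords:
            if keyword in text:
                keywords_found.add(keyword)

    found = set()
    search_keywords(b'<input type="password" name="passwd">', ['password', 'secret'], found)
    print(found)  # {'password'}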
