Skip to content

Commit

Permalink
Merge pull request #254 from fhightower/130-url-boundary-fix
Browse files Browse the repository at this point in the history
Improve URL boundary with quotations and parentheses
  • Loading branch information
fhightower authored Aug 20, 2022
2 parents f46e970 + 725a113 commit ee715ec
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 35 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/).

## [7.2.4] - UNRELEASED

### Fixed

- URL boundary to better respect the conventions of human language regarding quotation marks and parentheses ([#130](https://github.com/fhightower/ioc-finder/issues/130))

## [7.2.3] - 2022.07.14

### Fixed
Expand Down
38 changes: 19 additions & 19 deletions ioc_finder/ioc_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
import urllib.parse as urlparse
from typing import Callable, Dict, List, Mapping, Union
from typing import Callable, Dict, Iterable, List, Mapping, Union

import click
import ioc_fanger
Expand Down Expand Up @@ -49,7 +49,7 @@
]


def _deduplicate(indicator_list: List) -> List:
def _deduplicate(indicator_list: Iterable) -> List:
"""Deduplicate the list of observables."""
return list(set(indicator_list))

Expand Down Expand Up @@ -88,6 +88,22 @@ def prepare_text(text: str) -> str:
return text


def _clean_url(url: str) -> str:
"""Clean the given URL, removing common, unwanted characters which are usually not part of the URL."""
# if there is a ")" in the URL and not a "(", remove everything including and after the ")"
if ")" in url and "(" not in url:
url = url.split(")")[0]

# remove `"` and `'` characters from the end of a URL
url = url.rstrip('"').rstrip("'")

# remove `'/>` and `"/>` from the end of a URL (this character string occurs at the end of an HMTL tag with )
url = string_remove_from_end(url, "'/>")
url = string_remove_from_end(url, '"/>')

return url


def parse_urls(text: str, *, parse_urls_without_scheme: bool = True) -> List:
"""."""
if parse_urls_without_scheme:
Expand All @@ -96,24 +112,8 @@ def parse_urls(text: str, *, parse_urls_without_scheme: bool = True) -> List:
url_parse_results = ioc_grammars.url.searchString(text)
urls = _listify(url_parse_results)

clean_urls = []

# clean the url
for url in urls:
# remove `"` and `'` characters from the end of a URL
url = url.rstrip('"').rstrip("'")

# remove a final ')' if there is a '(' in the url
if url.endswith(")") and "(" not in url:
url = url.rstrip(")")

# remove `'/>` and `"/>` from the end of a URL (this character string occurs at the end of an HMTL tag with )
url = string_remove_from_end(url, "'/>")
url = string_remove_from_end(url, '"/>')

clean_urls.append(url)
clean_urls = map(_clean_url, urls)

# return the cleaned urls...
# I deduplicate them again because the structure of the URL may have changed when it was cleaned
return _deduplicate(clean_urls)

Expand Down
16 changes: 7 additions & 9 deletions ioc_finder/ioc_grammars.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,12 @@
url_scheme = one_of(schemes, caseless=True)
port = Word(":", nums, min=2)
url_authority = Combine(Or([complete_email_address, domain_name, ipv4_address, ipv6_address]) + Optional(port)("port"))
# although the ":" character is not valid in url paths,
# The url_path_word characters are taken from https://www.ietf.org/rfc/rfc3986.txt...
# (of particular interest is "Appendix A. Collected ABNF for URI")

# Although the ":" character is not valid in url paths,
# some urls are written with the ":" unencoded so we include it below
url_path_word = Word(alphanums + "-._~!$&'()*+,;:=%")
url_path_word = Word(alphanums + "-._~!$&'()*+,;=:%")
url_path = Combine(OneOrMore(MatchFirst([url_path_word, Literal("/")])))
url_query = Word(printables, excludeChars="#\"']")
url_fragment = Word(printables, excludeChars="?\"']")
Expand Down Expand Up @@ -281,9 +284,7 @@ def hasBothOrNeitherAngleBrackets(string):
alphanum_word_start
# we use `Or([Literal("pub-")...` instead of something like `CaselessLiteral("pub-")` b/c...
# we only want to parse "pub" when it is all upper or lowercased (not "pUb" or other, similar variations)
+ Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action(
pyparsing_common.downcase_tokens
)
+ Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action(pyparsing_common.downcase_tokens)
+ alphanum_word_end
)

Expand Down Expand Up @@ -324,9 +325,7 @@ def hasBothOrNeitherAngleBrackets(string):

# the mac address grammar was developed from https://en.wikipedia.org/wiki/MAC_address#Notational_conventions
# handles xx:xx:xx:xx:xx:xx or xx-xx-xx-xx-xx-xx
mac_address_16_bit_section = Combine(
(Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2)
)
mac_address_16_bit_section = Combine((Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2))
# handles xxxx.xxxx.xxxx
mac_address_32_bit_section = Combine((Word(hexnums, exact=4) + ".") * 2 + Word(hexnums, exact=4))
mac_address_word_start = WordStart(wordChars=alphanums + ":-.")
Expand Down Expand Up @@ -414,7 +413,6 @@ def hasBothOrNeitherAngleBrackets(string):
+ Combine(
one_of(enterprise_attack_techniques, caseless=True).set_parse_action(pyparsing_common.upcase_tokens)
+ Optional(attack_sub_technique)

)
+ alphanum_word_end
)
Expand Down
4 changes: 2 additions & 2 deletions tests/find_iocs_cases/registry_keys.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,10 @@
"urls": [
"https://citizenlab.ca/2016/05/stealth-falcon-appendices",
"https://citizenlab.ca/2016/05/stealth-falcon/",
"https://citizenlab.ca/about/),",
"https://citizenlab.ca/about/",
"https://docs.microsoft.com/en-us/windows/win32/bits/background-intelligent-transfer-service-portal",
"https://www.reuters.com/investigates/special-report/usa-spying-raven/",
"https://www.secureworks.com/blog/malware-lingers-with-bits).",
"https://www.secureworks.com/blog/malware-lingers-with-bits",
],
"attack_techniques": {
"enterprise": [
Expand Down
24 changes: 24 additions & 0 deletions tests/find_iocs_cases/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,28 @@
{},
id="URL and domains parsed",
),
param(
"Foo https://citizenlab.ca/about/), bar",
{
"urls": ["https://citizenlab.ca/about/"],
},
{"parse_domain_from_url": False},
id="URL boundary w/ ) handled properly",
),
param(
"DownloadString('https://example[.]com/rdp.ps1');g $I DownloadString(\"https://example[.]com/rdp.ps2\");g $I",
{
"urls": ["https://example.com/rdp.ps1", "https://example.com/rdp.ps2"],
},
{"parse_domain_from_url": False},
id="URL boundary w/ single or double quotes handled properly",
),
param(
"https://example.com/g//foo",
{
"urls": ["https://example.com/g//foo"],
},
{"parse_domain_from_url": False},
id="Consecutive slashes handled properly",
),
]
12 changes: 7 additions & 5 deletions tests/test_ioc_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,11 +192,13 @@ def test_ipv4_cidr_parsing():
def test_registry_key_parsing():
s = r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows HKLM\Software\Microsoft\Windows HKCC\Software\Microsoft\Windows"
iocs = find_iocs(s)
assert sorted([
r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows",
r"HKLM\Software\Microsoft\Windows",
r"HKCC\Software\Microsoft\Windows",
]) == sorted(iocs["registry_key_paths"])
assert sorted(
[
r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows",
r"HKLM\Software\Microsoft\Windows",
r"HKCC\Software\Microsoft\Windows",
]
) == sorted(iocs["registry_key_paths"])


def test_adsense_publisher_id_parsing():
Expand Down

0 comments on commit ee715ec

Please sign in to comment.