Merge pull request #254 from fhightower/130-url-boundary-fix

Improve URL boundary with quotations and parentheses
fhightower · Aug 20, 2022 · ee715ec · ee715ec
2 parents f46e970 + 725a113
commit ee715ec
Show file tree

Hide file tree

Showing 6 changed files with 65 additions and 35 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/).
 
+## [7.2.4] - UNRELEASED
+
+### Fixed
+
+- URL boundary detection now better respects the conventions of human language regarding quotation marks and parentheses ([#130](https://github.com/fhightower/ioc-finder/issues/130))
+
 ## [7.2.3] - 2022.07.14
 
 ### Fixed

diff --git a/ioc_finder/ioc_finder.py b/ioc_finder/ioc_finder.py
@@ -2,7 +2,7 @@
 
 import json
 import urllib.parse as urlparse
-from typing import Callable, Dict, List, Mapping, Union
+from typing import Callable, Dict, Iterable, List, Mapping, Union
 
 import click
 import ioc_fanger
@@ -49,7 +49,7 @@
 ]
 
 
-def _deduplicate(indicator_list: List) -> List:
+def _deduplicate(indicator_list: Iterable) -> List:
     """Deduplicate the list of observables."""
     return list(set(indicator_list))
 
@@ -88,6 +88,22 @@ def prepare_text(text: str) -> str:
     return text
 
 
+def _clean_url(url: str) -> str:
+    """Clean the given URL, removing common, unwanted characters which are usually not part of the URL."""
+    # if the URL contains a ")" but no "(", drop the first ")" and everything after it
+    if ")" in url and "(" not in url:
+        url = url.split(")")[0]
+
+    # remove `"` and `'` characters from the end of a URL
+    url = url.rstrip('"').rstrip("'")
+
+    # remove `'/>` and `"/>` from the end of a URL (these strings occur when a quoted URL ends an HTML tag)
+    url = string_remove_from_end(url, "'/>")
+    url = string_remove_from_end(url, '"/>')
+
+    return url
+
+
 def parse_urls(text: str, *, parse_urls_without_scheme: bool = True) -> List:
     """."""
     if parse_urls_without_scheme:
@@ -96,24 +112,8 @@ def parse_urls(text: str, *, parse_urls_without_scheme: bool = True) -> List:
         url_parse_results = ioc_grammars.url.searchString(text)
     urls = _listify(url_parse_results)
 
-    clean_urls = []
-
-    # clean the url
-    for url in urls:
-        # remove `"` and `'` characters from the end of a URL
-        url = url.rstrip('"').rstrip("'")
-
-        # remove a final ')' if there is a '(' in the url
-        if url.endswith(")") and "(" not in url:
-            url = url.rstrip(")")
-
-        # remove `'/>` and `"/>` from the end of a URL (this character string occurs at the end of an HMTL tag with )
-        url = string_remove_from_end(url, "'/>")
-        url = string_remove_from_end(url, '"/>')
-
-        clean_urls.append(url)
+    clean_urls = map(_clean_url, urls)
 
-    # return the cleaned urls...
     # I deduplicate them again because the structure of the URL may have changed when it was cleaned
     return _deduplicate(clean_urls)
 

diff --git a/ioc_finder/ioc_grammars.py b/ioc_finder/ioc_grammars.py
@@ -112,9 +112,12 @@
 url_scheme = one_of(schemes, caseless=True)
 port = Word(":", nums, min=2)
 url_authority = Combine(Or([complete_email_address, domain_name, ipv4_address, ipv6_address]) + Optional(port)("port"))
-# although the ":" character is not valid in url paths,
+# The url_path_word characters are taken from https://www.ietf.org/rfc/rfc3986.txt...
+# (of particular interest is "Appendix A.  Collected ABNF for URI")
+
+# RFC 3986 permits an unencoded ":" in most path segments (see "pchar" in Appendix A), and
 # some urls are written with the ":" unencoded so we include it below
-url_path_word = Word(alphanums + "-._~!$&'()*+,;:=%")
+url_path_word = Word(alphanums + "-._~!$&'()*+,;=:%")
 url_path = Combine(OneOrMore(MatchFirst([url_path_word, Literal("/")])))
 url_query = Word(printables, excludeChars="#\"']")
 url_fragment = Word(printables, excludeChars="?\"']")
@@ -281,9 +284,7 @@ def hasBothOrNeitherAngleBrackets(string):
     alphanum_word_start
     # we use `Or([Literal("pub-")...` instead of something like `CaselessLiteral("pub-")` b/c...
     # we only want to parse "pub" when it is all upper or lowercased (not "pUb" or other, similar variations)
-    + Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action(
-        pyparsing_common.downcase_tokens
-    )
+    + Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action(pyparsing_common.downcase_tokens)
     + alphanum_word_end
 )
 
@@ -324,9 +325,7 @@ def hasBothOrNeitherAngleBrackets(string):
 
 # the mac address grammar was developed from https://en.wikipedia.org/wiki/MAC_address#Notational_conventions
 # handles xx:xx:xx:xx:xx:xx or xx-xx-xx-xx-xx-xx
-mac_address_16_bit_section = Combine(
-    (Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2)
-)
+mac_address_16_bit_section = Combine((Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2))
 # handles xxxx.xxxx.xxxx
 mac_address_32_bit_section = Combine((Word(hexnums, exact=4) + ".") * 2 + Word(hexnums, exact=4))
 mac_address_word_start = WordStart(wordChars=alphanums + ":-.")
@@ -414,7 +413,6 @@ def hasBothOrNeitherAngleBrackets(string):
     + Combine(
         one_of(enterprise_attack_techniques, caseless=True).set_parse_action(pyparsing_common.upcase_tokens)
         + Optional(attack_sub_technique)
-
     )
     + alphanum_word_end
 )

diff --git a/tests/find_iocs_cases/registry_keys.py b/tests/find_iocs_cases/registry_keys.py
@@ -296,10 +296,10 @@
             "urls": [
                 "https://citizenlab.ca/2016/05/stealth-falcon-appendices",
                 "https://citizenlab.ca/2016/05/stealth-falcon/",
-                "https://citizenlab.ca/about/),",
+                "https://citizenlab.ca/about/",
                 "https://docs.microsoft.com/en-us/windows/win32/bits/background-intelligent-transfer-service-portal",
                 "https://www.reuters.com/investigates/special-report/usa-spying-raven/",
-                "https://www.secureworks.com/blog/malware-lingers-with-bits).",
+                "https://www.secureworks.com/blog/malware-lingers-with-bits",
             ],
             "attack_techniques": {
                 "enterprise": [

diff --git a/tests/find_iocs_cases/urls.py b/tests/find_iocs_cases/urls.py
@@ -10,4 +10,28 @@
         {},
         id="URL and domains parsed",
     ),
+    param(
+        "Foo https://citizenlab.ca/about/), bar",
+        {
+            "urls": ["https://citizenlab.ca/about/"],
+        },
+        {"parse_domain_from_url": False},
+        id="URL boundary w/ ) handled properly",
+    ),
+    param(
+        "DownloadString('https://example[.]com/rdp.ps1');g $I DownloadString(\"https://example[.]com/rdp.ps2\");g $I",
+        {
+            "urls": ["https://example.com/rdp.ps1", "https://example.com/rdp.ps2"],
+        },
+        {"parse_domain_from_url": False},
+        id="URL boundary w/ single or double quotes handled properly",
+    ),
+    param(
+        "https://example.com/g//foo",
+        {
+            "urls": ["https://example.com/g//foo"],
+        },
+        {"parse_domain_from_url": False},
+        id="Consecutive slashes handled properly",
+    ),
 ]
diff --git a/tests/test_ioc_finder.py b/tests/test_ioc_finder.py
@@ -192,11 +192,13 @@ def test_ipv4_cidr_parsing():
 def test_registry_key_parsing():
     s = r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows HKLM\Software\Microsoft\Windows HKCC\Software\Microsoft\Windows"
     iocs = find_iocs(s)
-    assert sorted([
-        r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows",
-        r"HKLM\Software\Microsoft\Windows",
-        r"HKCC\Software\Microsoft\Windows",
-    ]) == sorted(iocs["registry_key_paths"])
+    assert sorted(
+        [
+            r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows",
+            r"HKLM\Software\Microsoft\Windows",
+            r"HKCC\Software\Microsoft\Windows",
+        ]
+    ) == sorted(iocs["registry_key_paths"])
 
 
 def test_adsense_publisher_id_parsing():