Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve URL boundary with quotations and parentheses #254

Merged
merged 8 commits into from
Aug 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/).

## [7.2.4] - UNRELEASED

### Fixed

- URL boundary to better respect the conventions of human language regarding quotation marks and parentheses ([#130](https://github.com/fhightower/ioc-finder/issues/130))

## [7.2.3] - 2022.07.14

### Fixed
Expand Down
38 changes: 19 additions & 19 deletions ioc_finder/ioc_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
import urllib.parse as urlparse
from typing import Callable, Dict, List, Mapping, Union
from typing import Callable, Dict, Iterable, List, Mapping, Union

import click
import ioc_fanger
Expand Down Expand Up @@ -49,7 +49,7 @@
]


def _deduplicate(indicator_list: List) -> List:
def _deduplicate(indicator_list: Iterable) -> List:
"""Deduplicate the list of observables."""
return list(set(indicator_list))

Expand Down Expand Up @@ -88,6 +88,22 @@ def prepare_text(text: str) -> str:
return text


def _clean_url(url: str) -> str:
"""Clean the given URL, removing common, unwanted characters which are usually not part of the URL."""
# if there is a ")" in the URL and not a "(", remove everything including and after the ")"
if ")" in url and "(" not in url:
url = url.split(")")[0]

# remove `"` and `'` characters from the end of a URL
url = url.rstrip('"').rstrip("'")

# remove `'/>` and `"/>` from the end of a URL (this character string occurs at the end of an HMTL tag with )
url = string_remove_from_end(url, "'/>")
url = string_remove_from_end(url, '"/>')

return url
Comment on lines +91 to +104
Copy link
Owner Author

@fhightower fhightower Aug 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Three changes to note:

  1. Moving this into its own function for easier extensibility in the future
  2. Removing ) before rstriping quotation marks so cases like http://example.com/foo') are properly parsed as http://example.com/foo
  3. Previously, we were just doing url.rstrip(")") to remove a closing paren. at the end of a URL; now we do url = url.split(")")[0] to remove the paren and everything after it.



def parse_urls(text: str, *, parse_urls_without_scheme: bool = True) -> List:
"""."""
if parse_urls_without_scheme:
Expand All @@ -96,24 +112,8 @@ def parse_urls(text: str, *, parse_urls_without_scheme: bool = True) -> List:
url_parse_results = ioc_grammars.url.searchString(text)
urls = _listify(url_parse_results)

clean_urls = []

# clean the url
for url in urls:
# remove `"` and `'` characters from the end of a URL
url = url.rstrip('"').rstrip("'")

# remove a final ')' if there is a '(' in the url
if url.endswith(")") and "(" not in url:
url = url.rstrip(")")

# remove `'/>` and `"/>` from the end of a URL (this character string occurs at the end of an HMTL tag with )
url = string_remove_from_end(url, "'/>")
url = string_remove_from_end(url, '"/>')

clean_urls.append(url)
clean_urls = map(_clean_url, urls)

# return the cleaned urls...
# I deduplicate them again because the structure of the URL may have changed when it was cleaned
return _deduplicate(clean_urls)

Expand Down
16 changes: 7 additions & 9 deletions ioc_finder/ioc_grammars.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,12 @@
url_scheme = one_of(schemes, caseless=True)
port = Word(":", nums, min=2)
url_authority = Combine(Or([complete_email_address, domain_name, ipv4_address, ipv6_address]) + Optional(port)("port"))
# although the ":" character is not valid in url paths,
# The url_path_word characters are taken from https://www.ietf.org/rfc/rfc3986.txt...
# (of particular interest is "Appendix A. Collected ABNF for URI")

# Although the ":" character is not valid in url paths,
# some urls are written with the ":" unencoded so we include it below
url_path_word = Word(alphanums + "-._~!$&'()*+,;:=%")
url_path_word = Word(alphanums + "-._~!$&'()*+,;=:%")
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switch order of characters to align w/ RFC.

url_path = Combine(OneOrMore(MatchFirst([url_path_word, Literal("/")])))
url_query = Word(printables, excludeChars="#\"']")
url_fragment = Word(printables, excludeChars="?\"']")
Expand Down Expand Up @@ -281,9 +284,7 @@ def hasBothOrNeitherAngleBrackets(string):
alphanum_word_start
# we use `Or([Literal("pub-")...` instead of something like `CaselessLiteral("pub-")` b/c...
# we only want to parse "pub" when it is all upper or lowercased (not "pUb" or other, similar variations)
+ Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action(
pyparsing_common.downcase_tokens
)
+ Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action(pyparsing_common.downcase_tokens)
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lint update

+ alphanum_word_end
)

Expand Down Expand Up @@ -324,9 +325,7 @@ def hasBothOrNeitherAngleBrackets(string):

# the mac address grammar was developed from https://en.wikipedia.org/wiki/MAC_address#Notational_conventions
# handles xx:xx:xx:xx:xx:xx or xx-xx-xx-xx-xx-xx
mac_address_16_bit_section = Combine(
(Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2)
)
mac_address_16_bit_section = Combine((Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2))
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lint update

# handles xxxx.xxxx.xxxx
mac_address_32_bit_section = Combine((Word(hexnums, exact=4) + ".") * 2 + Word(hexnums, exact=4))
mac_address_word_start = WordStart(wordChars=alphanums + ":-.")
Expand Down Expand Up @@ -414,7 +413,6 @@ def hasBothOrNeitherAngleBrackets(string):
+ Combine(
one_of(enterprise_attack_techniques, caseless=True).set_parse_action(pyparsing_common.upcase_tokens)
+ Optional(attack_sub_technique)

)
+ alphanum_word_end
)
Expand Down
4 changes: 2 additions & 2 deletions tests/find_iocs_cases/registry_keys.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,10 @@
"urls": [
"https://citizenlab.ca/2016/05/stealth-falcon-appendices",
"https://citizenlab.ca/2016/05/stealth-falcon/",
"https://citizenlab.ca/about/),",
"https://citizenlab.ca/about/",
"https://docs.microsoft.com/en-us/windows/win32/bits/background-intelligent-transfer-service-portal",
"https://www.reuters.com/investigates/special-report/usa-spying-raven/",
"https://www.secureworks.com/blog/malware-lingers-with-bits).",
"https://www.secureworks.com/blog/malware-lingers-with-bits",
],
"attack_techniques": {
"enterprise": [
Expand Down
24 changes: 24 additions & 0 deletions tests/find_iocs_cases/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,28 @@
{},
id="URL and domains parsed",
),
param(
"Foo https://citizenlab.ca/about/), bar",
{
"urls": ["https://citizenlab.ca/about/"],
},
{"parse_domain_from_url": False},
id="URL boundary w/ ) handled properly",
),
fhightower marked this conversation as resolved.
Show resolved Hide resolved
param(
"DownloadString('https://example[.]com/rdp.ps1');g $I DownloadString(\"https://example[.]com/rdp.ps2\");g $I",
{
"urls": ["https://example.com/rdp.ps1", "https://example.com/rdp.ps2"],
},
{"parse_domain_from_url": False},
id="URL boundary w/ single or double quotes handled properly",
),
param(
"https://example.com/g//foo",
{
"urls": ["https://example.com/g//foo"],
},
{"parse_domain_from_url": False},
id="Consecutive slashes handled properly",
),
]
12 changes: 7 additions & 5 deletions tests/test_ioc_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,11 +192,13 @@ def test_ipv4_cidr_parsing():
def test_registry_key_parsing():
s = r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows HKLM\Software\Microsoft\Windows HKCC\Software\Microsoft\Windows"
iocs = find_iocs(s)
assert sorted([
r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows",
r"HKLM\Software\Microsoft\Windows",
r"HKCC\Software\Microsoft\Windows",
]) == sorted(iocs["registry_key_paths"])
assert sorted(
[
r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows",
r"HKLM\Software\Microsoft\Windows",
r"HKCC\Software\Microsoft\Windows",
]
) == sorted(iocs["registry_key_paths"])
Comment on lines +195 to +201
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lint update



def test_adsense_publisher_id_parsing():
Expand Down