Skip to content

Commit

Permalink
maintenance: simplify regexes (#56)
Browse files: browse the repository at this point in the history
  • Loading branch information
adbar committed Aug 31, 2023
1 parent 8292100 commit ac6589e
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 6 deletions.
1 change: 1 addition & 0 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
SELECTION = re.compile(
r'(https?://[^">&? ]+?)(?:https?://)|(?:https?://[^/]+?/[^/]+?[&?]u(rl)?=)(https?://[^"> ]+)'
)

MIDDLE_URL = re.compile(r"https?://.+?(https?://.+?)(?:https?://|$)")
NETLOC_RE = re.compile(r"(?<=\w):(?:80|443)")

Expand Down
2 changes: 1 addition & 1 deletion courlan/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

FIND_LINKS_REGEX = re.compile(r"<a [^<>]+?>", re.I)
HREFLANG_REGEX = re.compile(r'hreflang=["\']?([a-z-]+)', re.I)
LINK_REGEX = re.compile(r'href=["\']?([^ ]+?)(["\']|[ >])', re.I)
LINK_REGEX = re.compile(r'href=["\']?([^ ]+?)(["\' >])', re.I)


def check_url(
Expand Down
3 changes: 2 additions & 1 deletion courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
# domain/host names
UNSUITABLE_DOMAIN = re.compile(r"[?=`$;,]|:$|\.(.|[0-9]+|[^.]{25,})$")


# content filters
SITE_STRUCTURE = re.compile(
# wordpress
Expand All @@ -41,7 +42,7 @@
re.IGNORECASE,
) # (?=[&?])
ADULT_AND_VIDEOS = re.compile(
r"[/_-](?:bild\-?kontakte|fick|gangbang|incest|live-?chat|live-?cams?|porno?|sexyeroti[ck]|sexcam|swinger|xxx)\b",
r"[/_-](?:bild-?kontakte|fick|gangbang|incest|live-?cams?|live-?chat|porno?|sexcam|sexyeroti[ck]|swinger|x{3})\b",
re.IGNORECASE,
)

Expand Down
8 changes: 4 additions & 4 deletions courlan/urlutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,18 @@


DOMAIN_REGEX = re.compile(
r"(?:http|ftp)s?://" # protocols
r"(?:(?:f|ht)tp)s?://" # protocols
r"(?:[^/?#]{,63}\.)?" # subdomain, www, etc.
r"([^/?#.]{4,63}\.[^/?#]{2,63}|" # domain and extension
r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|" # IPv4
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|" # IPv4
r"[0-9a-f:]{16,})" # IPv6
r"(?:/|$)" # slash or end of string
)
NO_EXTENSION_REGEX = re.compile(r"(^[^.]+)")
STRIP_DOMAIN_REGEX = re.compile(r"^.+?:.*?@|(?<=[^0-9]):[0-9]+")
STRIP_DOMAIN_REGEX = re.compile(r"^.+?:.*?@|(?<=\D):\d+")
CLEAN_FLD_REGEX = re.compile(r"^www[0-9]*\.")
INNER_SLASH_REGEX = re.compile(r"(.+/)+")
FEED_WHITELIST_REGEX = re.compile(r"(feedburner|feedproxy)", re.I)
FEED_WHITELIST_REGEX = re.compile(r"(?:feed(?:burner|proxy))", re.I)


@lru_cache(maxsize=1024)
Expand Down

0 comments on commit ac6589e

Please sign in to comment.