Skip to content

Commit

Permalink
cleaning regexes: regroup + extend
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed May 25, 2021
1 parent f93675a commit e4edd16
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 7 deletions.
23 changes: 16 additions & 7 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@
TARGET_LANG_DE, TARGET_LANG_EN


# Precompiled patterns for URL scrubbing (module-level so they are compiled once).

# Bare protocol prefix; used to count how many "http(s)://" occurrences a string contains.
PROTOCOLS = re.compile(r'https?://')
# Faulty/double URLs. Two alternatives:
#   1. capture group 1: a URL directly followed by a second "https?://" (two URLs glued together);
#   2. capture group 3: the target of a social-sharing/redirect link whose query carries
#      "u=" or "url=" followed by the real URL (e.g. twitter.com/share?...&url=..., facebook sharer.php?u=...).
SELECTION = re.compile(r'(https?://[^">&? ]+?)(?:https?://)|(?:https?://[^/]+?/[^/]+?[&?]u(rl)?=)(https?://[^"> ]+)')
# Fallback for concatenated URLs: capture the URL sandwiched between a leading
# "https?://..." prefix and either a third "https?://" or the end of the string.
MIDDLE_URL = re.compile(r'https?://.+?(https?://.+?)(?:https?://|$)')

# Default-port suffix (":80" or ":443") after a word character in the netloc,
# so it can be stripped during normalization.
NETLOC_RE = re.compile(r'(?<=\w):(?:80|443)')
# Runs of consecutive slashes in the path (collapsed to a single "/").
PATH1 = re.compile(r'/+')
# One or more leading "/.." path segments (removed during path normalization);
# the lookahead ensures "/..x" is not treated as a parent-directory segment.
PATH2 = re.compile(r'^(?:/\.\.(?![^/]))+')


def clean_url(url, language=None):
'''Helper function: chained scrubbing and normalization'''
try:
Expand All @@ -42,15 +51,15 @@ def scrub_url(url):
#if '"' in link:
# link = link.split('"')[0]
# double/faulty URLs
protocols = re.findall(r'https?://', url)
protocols = PROTOCOLS.findall(url)
if len(protocols) > 1 and not 'web.archive.org' in url:
logging.debug('double url: %s %s', len(protocols), url)
match = re.match(r'(https?://[^"> ]+?)(?:https?://)', url)
match = SELECTION.match(url)
if match and validate_url(match.group(1))[0] is True:
logging.debug('taking url: %s', url)
url = match.group(1)
logging.debug('taking url: %s', url)
else:
match = re.match(r'https?://.+?(https?://.+?)(?:https?://|$)', url)
match = MIDDLE_URL.match(url)
if match and validate_url(match.group(1))[0] is True:
url = match.group(1)
logging.debug('taking url: %s', url)
Expand Down Expand Up @@ -91,11 +100,11 @@ def normalize_url(parsed_url, strict=False, language=None):
parsed_url = urlparse(parsed_url)
# port
if parsed_url.port is not None and parsed_url.port in (80, 443):
parsed_url = parsed_url._replace(netloc=re.sub(r'(?<=\w):(?:80|443)', '', parsed_url.netloc))
parsed_url = parsed_url._replace(netloc=NETLOC_RE.sub('', parsed_url.netloc))
# path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py
newpath = re.sub(r'/+', '/', parsed_url.path)
newpath = PATH1.sub('/', parsed_url.path)
# Leading /../'s in the path are removed
newpath = re.sub(r'^(?:/\.\.(?![^/]))+', '', newpath)
newpath = PATH2.sub('', newpath)
# fragment
if strict is True:
newfragment = ''
Expand Down
3 changes: 3 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ def test_scrub():
assert scrub_url('https://g__https://www.dwds.de/') == 'https://www.dwds.de'
# exception for archive URLs
assert scrub_url('https://web.archive.org/web/20131021165347/https://www.imdb.com/') == 'https://web.archive.org/web/20131021165347/https://www.imdb.com'
# social sharing
assert scrub_url('https://twitter.com/share?&text=Le%20sabre%20de%20bambou%20%232&via=NouvellesJapon&url=https://nouvellesdujapon.com/le-sabre-de-bambou-2') == 'https://nouvellesdujapon.com/le-sabre-de-bambou-2'
assert scrub_url('https://www.facebook.com/sharer.php?u=https://nouvellesdujapon.com/le-sabre-de-bambou-2') == 'https://nouvellesdujapon.com/le-sabre-de-bambou-2'


def test_extension_filter():
Expand Down

0 comments on commit e4edd16

Please sign in to comment.