Skip to content

Commit

Permalink
cleaning regexes: regroup + extend
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed May 25, 2021
1 parent f93675a commit e4edd16
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 7 deletions.
23 changes: 16 additions & 7 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@
TARGET_LANG_DE, TARGET_LANG_EN


# Precompiled patterns for URL scrubbing (module-level so they are compiled once).

# Bare protocol prefix; used to count how many "http(s)://" occurrences a string contains.
PROTOCOLS = re.compile(r'https?://')
# Faulty/double URLs. Two alternatives:
#   1. capture group 1: a URL directly followed by a second "https?://" (two URLs glued together);
#   2. capture group 3: the target of a social-sharing/redirect link whose query carries
#      "u=" or "url=" followed by the real URL (e.g. twitter.com/share?...&url=..., facebook sharer.php?u=...).
SELECTION = re.compile(r'(https?://[^">&? ]+?)(?:https?://)|(?:https?://[^/]+?/[^/]+?[&?]u(rl)?=)(https?://[^"> ]+)')
# Fallback for concatenated URLs: capture the URL sandwiched between a leading
# "https?://..." prefix and either a third "https?://" or the end of the string.
MIDDLE_URL = re.compile(r'https?://.+?(https?://.+?)(?:https?://|$)')

# Default-port suffix (":80" or ":443") after a word character in the netloc,
# so it can be stripped during normalization.
NETLOC_RE = re.compile(r'(?<=\w):(?:80|443)')
# Runs of consecutive slashes in the path (collapsed to a single "/").
PATH1 = re.compile(r'/+')
# One or more leading "/.." path segments (removed during path normalization);
# the lookahead ensures "/..x" is not treated as a parent-directory segment.
PATH2 = re.compile(r'^(?:/\.\.(?![^/]))+')


def clean_url(url, language=None):
'''Helper function: chained scrubbing and normalization'''
try:
Expand All @@ -42,15 +51,15 @@ def scrub_url(url):
#if '"' in link:
# link = link.split('"')[0]
# double/faulty URLs
protocols = re.findall(r'https?://', url)
protocols = PROTOCOLS.findall(url)
if len(protocols) > 1 and not 'web.archive.org' in url:
logging.debug('double url: %s %s', len(protocols), url)
match = re.match(r'(https?://[^"> ]+?)(?:https?://)', url)
match = SELECTION.match(url)
if match and validate_url(match.group(1))[0] is True:
logging.debug('taking url: %s', url)
url = match.group(1)
logging.debug('taking url: %s', url)
else:
match = re.match(r'https?://.+?(https?://.+?)(?:https?://|$)', url)
match = MIDDLE_URL.match(url)
if match and validate_url(match.group(1))[0] is True:
url = match.group(1)
logging.debug('taking url: %s', url)
Expand Down Expand Up @@ -91,11 +100,11 @@ def normalize_url(parsed_url, strict=False, language=None):
parsed_url = urlparse(parsed_url)
# port
if parsed_url.port is not None and parsed_url.port in (80, 443):
parsed_url = parsed_url._replace(netloc=re.sub(r'(?<=\w):(?:80|443)', '', parsed_url.netloc))
parsed_url = parsed_url._replace(netloc=NETLOC_RE.sub('', parsed_url.netloc))
# path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py
newpath = re.sub(r'/+', '/', parsed_url.path)
newpath = PATH1.sub('/', parsed_url.path)
# Leading /../'s in the path are removed
newpath = re.sub(r'^(?:/\.\.(?![^/]))+', '', newpath)
newpath = PATH2.sub('', newpath)
# fragment
if strict is True:
newfragment = ''
Expand Down
3 changes: 3 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ def test_scrub():
assert scrub_url('https://g__https://www.dwds.de/') == 'https://www.dwds.de'
# exception for archive URLs
assert scrub_url('https://web.archive.org/web/20131021165347/https://www.imdb.com/') == 'https://web.archive.org/web/20131021165347/https://www.imdb.com'
# social sharing
assert scrub_url('https://twitter.com/share?&text=Le%20sabre%20de%20bambou%20%232&via=NouvellesJapon&url=https://nouvellesdujapon.com/le-sabre-de-bambou-2') == 'https://nouvellesdujapon.com/le-sabre-de-bambou-2'
assert scrub_url('https://www.facebook.com/sharer.php?u=https://nouvellesdujapon.com/le-sabre-de-bambou-2') == 'https://nouvellesdujapon.com/le-sabre-de-bambou-2'


def test_extension_filter():
Expand Down

0 comments on commit e4edd16

Please sign in to comment.