shorter code and more tests

adbar · May 25, 2021 · 4721260
1 parent e4edd16
commit 4721260
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 13 deletions.
diff --git a/courlan/core.py b/courlan/core.py
@@ -26,8 +26,7 @@
 LOGGER = logging.getLogger(__name__)
 
 FIND_LINKS_REGEX = re.compile(r'<a [^<>]+?>', re.I)
-HREFLANG_DE_REGEX = re.compile(r'hreflang=["\'](de|x-default)', re.I)
-HREFLANG_EN_REGEX = re.compile(r'hreflang=["\'](en|x-default)', re.I)
+HREFLANG_REGEX = re.compile(r'hreflang=["\']([a-z-]+)', re.I)
 LINK_REGEX = re.compile(r'href=["\']([^ ]+?)["\']', re.I)
 
 
@@ -174,12 +173,12 @@ def extract_links(pagecontent, base_url, external_bool, language=None,
     # extract links
     for link in FIND_LINKS_REGEX.findall(pagecontent):
         # https://en.wikipedia.org/wiki/Hreflang
-        if language in ('de', 'en') and 'hreflang' in link:
-            if language == 'de' and HREFLANG_DE_REGEX.search(link):
-                mymatch = LINK_REGEX.search(link)
-                if mymatch:
-                    candidates.add(mymatch.group(1))
-            elif language == 'en' and HREFLANG_EN_REGEX.search(link):
+        if language is not None and 'hreflang' in link:
+            langmatch = HREFLANG_REGEX.search(link)
+            if langmatch and (
+                langmatch.group(1).startswith(language) or
+                langmatch.group(1) == 'x-default'
+                ):
                 mymatch = LINK_REGEX.search(link)
                 if mymatch:
                     candidates.add(mymatch.group(1))
@@ -198,11 +197,8 @@ def extract_links(pagecontent, base_url, external_bool, language=None,
                             with_redirects=redirects, language=language)
         if checked is None:
             continue
-        # external links
-        if external_bool is True and is_external(link, reference) is True:
-            validlinks.add(checked[0])
-        # internal links
-        elif external_bool is False and is_external(link, reference) is False:
+        # external/internal links
+        if external_bool == is_external(link, reference):
             validlinks.add(checked[0])
     # return
     LOGGER.info('%s links found – %s valid links', len(candidates), len(validlinks))

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -49,6 +49,9 @@ def test_fix_relative():
 
 
 def test_scrub():
+    # clean: scrub + normalize
+    assert clean_url('ø\xaa') == 'øª'
+    # scrub
     assert scrub_url('  https://www.dwds.de') == 'https://www.dwds.de'
     assert scrub_url('<![CDATA[https://www.dwds.de]]>') == 'https://www.dwds.de'
     assert scrub_url('https://www.dwds.de/test?param=test&amp;other=test') == 'https://www.dwds.de/test?param=test&other=test'
@@ -93,6 +96,8 @@ def test_lang_filter():
     assert lang_filter('https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377', None) is True
     assert lang_filter('https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377', 'de') is False
     assert lang_filter('https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377', 'fr') is True
+    assert lang_filter('https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377', 'en') is False
+    assert lang_filter('https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377', 'es') is False
 
 
 def test_navigation():
@@ -205,11 +210,17 @@ def test_extraction():
     '''test link comparison in HTML'''
     assert len(extract_links(None, 'https://test.com/', False)) == 0
     assert len(extract_links('', 'https://test.com/', False)) == 0
+    # language
     pagecontent = '<html><a href="https://test.com/example" hreflang="de-DE"/></html>'
     assert len(extract_links(pagecontent, 'https://test.com/', False)) == 1
     assert len(extract_links(pagecontent, 'https://test.com/', True)) == 0
     assert len(extract_links(pagecontent, 'https://test.com/', False, language='de')) == 1
     assert len(extract_links(pagecontent, 'https://test.com/', False, language='en')) == 0
+    # x-default
+    pagecontent = '<html><a href="https://test.com/example" hreflang="x-default"/></html>'
+    assert len(extract_links(pagecontent, 'https://test.com/', False, language='de')) == 1
+    assert len(extract_links(pagecontent, 'https://test.com/', False, language='en')) == 1
+    # language + content
     pagecontent = '<html><a hreflang="de-DE" href="https://test.com/example"/><a href="https://test.com/example2"/><a href="https://test.com/example2 ADDITIONAL"/></html>'
     links = extract_links(pagecontent, 'https://test.com/', False)
     assert sorted(links) == ['https://test.com/example', 'https://test.com/example2']