Skip to content

Commit

Permalink
shorter code and more tests
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed May 25, 2021
1 parent e4edd16 commit 4721260
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 13 deletions.
22 changes: 9 additions & 13 deletions courlan/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@
LOGGER = logging.getLogger(__name__)

FIND_LINKS_REGEX = re.compile(r'<a [^<>]+?>', re.I)
HREFLANG_DE_REGEX = re.compile(r'hreflang=["\'](de|x-default)', re.I)
HREFLANG_EN_REGEX = re.compile(r'hreflang=["\'](en|x-default)', re.I)
HREFLANG_REGEX = re.compile(r'hreflang=["\']([a-z-]+)', re.I)
LINK_REGEX = re.compile(r'href=["\']([^ ]+?)["\']', re.I)


Expand Down Expand Up @@ -174,12 +173,12 @@ def extract_links(pagecontent, base_url, external_bool, language=None,
# extract links
for link in FIND_LINKS_REGEX.findall(pagecontent):
# https://en.wikipedia.org/wiki/Hreflang
if language in ('de', 'en') and 'hreflang' in link:
if language == 'de' and HREFLANG_DE_REGEX.search(link):
mymatch = LINK_REGEX.search(link)
if mymatch:
candidates.add(mymatch.group(1))
elif language == 'en' and HREFLANG_EN_REGEX.search(link):
if language is not None and 'hreflang' in link:
langmatch = HREFLANG_REGEX.search(link)
if langmatch and (
langmatch.group(1).startswith(language) or
langmatch.group(1) == 'x-default'
):
mymatch = LINK_REGEX.search(link)
if mymatch:
candidates.add(mymatch.group(1))
Expand All @@ -198,11 +197,8 @@ def extract_links(pagecontent, base_url, external_bool, language=None,
with_redirects=redirects, language=language)
if checked is None:
continue
# external links
if external_bool is True and is_external(link, reference) is True:
validlinks.add(checked[0])
# internal links
elif external_bool is False and is_external(link, reference) is False:
# external/internal links
if external_bool == is_external(link, reference):
validlinks.add(checked[0])
# return
LOGGER.info('%s links found – %s valid links', len(candidates), len(validlinks))
Expand Down
11 changes: 11 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ def test_fix_relative():


def test_scrub():
# clean: scrub + normalize
assert clean_url('ø\xaa') == 'øª'
# scrub
assert scrub_url(' https://www.dwds.de') == 'https://www.dwds.de'
assert scrub_url('<![CDATA[https://www.dwds.de]]>') == 'https://www.dwds.de'
assert scrub_url('https://www.dwds.de/test?param=test&amp;other=test') == 'https://www.dwds.de/test?param=test&other=test'
Expand Down Expand Up @@ -93,6 +96,8 @@ def test_lang_filter():
assert lang_filter('https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377', None) is True
assert lang_filter('https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377', 'de') is False
assert lang_filter('https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377', 'fr') is True
assert lang_filter('https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377', 'en') is False
assert lang_filter('https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377', 'es') is False


def test_navigation():
Expand Down Expand Up @@ -205,11 +210,17 @@ def test_extraction():
'''test link comparison in HTML'''
assert len(extract_links(None, 'https://test.com/', False)) == 0
assert len(extract_links('', 'https://test.com/', False)) == 0
# language
pagecontent = '<html><a href="https://test.com/example" hreflang="de-DE"/></html>'
assert len(extract_links(pagecontent, 'https://test.com/', False)) == 1
assert len(extract_links(pagecontent, 'https://test.com/', True)) == 0
assert len(extract_links(pagecontent, 'https://test.com/', False, language='de')) == 1
assert len(extract_links(pagecontent, 'https://test.com/', False, language='en')) == 0
# x-default
pagecontent = '<html><a href="https://test.com/example" hreflang="x-default"/></html>'
assert len(extract_links(pagecontent, 'https://test.com/', False, language='de')) == 1
assert len(extract_links(pagecontent, 'https://test.com/', False, language='en')) == 1
# language + content
pagecontent = '<html><a hreflang="de-DE" href="https://test.com/example"/><a href="https://test.com/example2"/><a href="https://test.com/example2 ADDITIONAL"/></html>'
links = extract_links(pagecontent, 'https://test.com/', False)
assert sorted(links) == ['https://test.com/example', 'https://test.com/example2']
Expand Down

0 comments on commit 4721260

Please sign in to comment.