Skip to content

Commit

Permalink
maintenance: replace langcodes by babel (#89)
Browse files (browse the repository at this point in the history)
* maintenance: replace langcodes by babel

* add tests

* fix mypy for now

* simplify
  • Loading branch information
adbar committed Apr 22, 2024
1 parent 536607d commit accbb1b
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 18 deletions.
28 changes: 12 additions & 16 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from typing import Any, Optional, Tuple
from urllib.parse import urlsplit

from langcodes import Language, tag_is_valid
from babel import Locale, UnknownLocaleError # type: ignore

from .langinfo import COUNTRY_CODES, LANGUAGE_CODES

Expand Down Expand Up @@ -181,23 +181,19 @@ def extension_filter(urlpath: str) -> bool:


def langcodes_score(language: str, segment: str, score: int) -> int:
"""Use langcodes on selected URL segments and integrate
them into a score."""
# see also: https://babel.pocoo.org/en/latest/locale.html
"""Use language codes or locale parser on selected URL segments and
integrate them into a score."""
# test if the code looks like a country or a language
if segment[:2] not in COUNTRY_CODES and segment[:2] not in LANGUAGE_CODES:
return score
# test if tag is valid (caution: private codes are)
if tag_is_valid(segment):
# try to identify language code
identified = Language.get(segment).language
# see if it matches
if identified is not None:
LOGGER.debug("langcode %s found in URL segment %s", identified, segment)
if identified != language:
score -= 1
else:
beginning = segment[:2]
if beginning in LANGUAGE_CODES or beginning in COUNTRY_CODES:
# use locale parser
try:
if Locale.parse(segment).language == language:
score += 1
else:
score -= 1
except UnknownLocaleError:
pass
return score


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def get_long_description():
include_package_data=True,
python_requires=">=3.6",
install_requires=[
"langcodes >= 3.3.0",
"babel >= 2.11.0",
"tld == 0.12.6; python_version < '3.7'",
"tld >= 0.13; python_version >= '3.7'",
"urllib3 >= 1.26, < 2; python_version < '3.7'",
Expand Down
12 changes: 11 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@
lang_filter,
)
from courlan.core import filter_links
from courlan.filters import domain_filter, extension_filter, path_filter, type_filter
from courlan.filters import (
domain_filter,
extension_filter,
langcodes_score,
path_filter,
type_filter,
)
from courlan.meta import clear_caches
from courlan.urlutils import _parse, get_tldinfo, is_known_link

Expand Down Expand Up @@ -440,6 +446,10 @@ def test_lang_filter():
lang_filter("http://bz.berlin1.de/kino/050513/fans.html", "de", strict=True)
is False
)
assert langcodes_score("en", "en_HK", 0) == 1
assert langcodes_score("en", "en_XY", 0) == 0
assert langcodes_score("en", "de_DE", 0) == -1

# assert lang_filter('http://www.verfassungen.de/ch/basel/verf03.htm'. 'de') is True
# assert lang_filter('http://www.uni-stuttgart.de/hi/fnz/lehrveranst.html', 'de') is True
# http://www.wildwechsel.de/ww/front_content.php?idcatart=177&lang=4&client=6&a=view&eintrag=100&a=view&eintrag=0&a=view&eintrag=20&a=view&eintrag=80&a=view&eintrag=20
Expand Down

0 comments on commit accbb1b

Please sign in to comment.