Skip to content

Commit

Permalink
prepare new version
Browse files: browse the repository at this point in the history
  • Loading branch information
adbar committed Sep 21, 2020
1 parent 619853a commit 1c5325f
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 12 deletions.
7 changes: 7 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
## History / Changelog


### 0.2.2

- English and German language filters
- Function to detect external links
- Support for domain blacklisting


### 0.2.1

- Less aggressive strict filters
Expand Down
2 changes: 1 addition & 1 deletion courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def clean_query(parsed_url, strict=False, language=None):
if language == 'de' and found_lang not in TARGET_LANG_DE:
logging.debug('bad lang: %s %s %s', language, qelem, found_lang)
raise ValueError
elif language == 'en' and found_lang not in TARGET_LANG_EN:
if language == 'en' and found_lang not in TARGET_LANG_EN:
logging.debug('bad lang: %s %s %s', language, qelem, found_lang)
raise ValueError
# insert
Expand Down
1 change: 0 additions & 1 deletion courlan/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,3 @@ def is_external(url, reference, ignore_suffix=True):
if domain != ref_domain:
return True
return False

18 changes: 9 additions & 9 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ def extension_filter(component):


def lang_filter(url, language):
'''Heuristic targeting internationalization'''
if language is not None and language in ('de', 'en'):
match = URL_LANG_FILTER.match(url)
if match:
if language == 'de' and match.group(1) not in ('de', 'deu'):
return False
elif language == 'en' and match.group(1) not in ('en', 'eng'):
return False
return True
'''Heuristic targeting internationalization'''
if language is not None and language in ('de', 'en'):
match = URL_LANG_FILTER.match(url)
if match:
if language == 'de' and match.group(1) not in ('de', 'deu'):
return False
if language == 'en' and match.group(1) not in ('en', 'eng'):
return False
return True


def spam_filter(url):
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def readme():

setup(
name='courlan',
version='0.2.1',
version='0.2.2',
description='Clean, filter, normalize, and sample URLs',
long_description=readme(),
classifiers=[
Expand Down Expand Up @@ -59,6 +59,7 @@ def readme():
include_package_data=True,
python_requires='>=3.4',
install_requires=[
'requests',
'tldextract',
],
#extras_require=extras,
Expand Down

0 comments on commit 1c5325f

Please sign in to comment.