Skip to content

Commit

Permalink
reviewed filter sensitivity
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Sep 2, 2020
1 parent ab5647e commit 4da1b15
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 3 deletions.
6 changes: 3 additions & 3 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
from urllib.parse import urlparse


# Path segments that usually denote listing/taxonomy/author/upload pages
# (English and German spellings); matched case-insensitively between slashes.
WORDPRESS_FILTER = re.compile(r'/(?:tags?|schlagwort|category|cat|kategorie|kat|auth?or|page|seite|user|search|gallery|gall?erie|labels|archives|uploads|modules|attachment)/', re.IGNORECASE)
# A dot followed by a known feed/asset/media/archive extension, ending on a
# word boundary so that the extension may be followed by e.g. '?' or '&'.
PARAM_FILTER = re.compile(r'\.(atom|json|css|xml|js|jpg|jpeg|png|gif|tiff|pdf|ogg|mp3|m4a|aac|avi|mp4|mov|webm|flv|ico|pls|zip|tar|gz|iso|swf)\b', re.IGNORECASE)
# "impressum" or "index" (optionally with .htm/.html/.php) at the very end of
# the string and directly after a dot plus 2-5 letters and a slash, i.e. only
# when the page sits right below something that looks like a host name.
PATH_FILTER = re.compile(r'\.[a-z]{2,5}/(impressum|index)(\.html?|\.php)?$', re.IGNORECASE)
# Keyword heuristics: whole words, word prefixes (arsch/fick/porn...) and
# word suffixes (...cash/...swinger), all case-insensitive.
ADULT_FILTER = re.compile(r'\b(?:adult|amateur|cams?|gangbang|incest|sexyeroti[ck]|sexcam|bild\-?kontakte)\b|\b(?:arsch|fick|porno?)|(?:cash|swinger)\b', re.IGNORECASE)


Expand All @@ -26,7 +26,7 @@ def basic_filter(url):

def extension_filter(component):
    '''Filter based on file extension.

    Returns False when `component` ends with a dot followed by 2-5
    lowercase letters and that ending is not one of the accepted page
    extensions; returns True in every other case (including strings
    without such an ending, e.g. when a query string follows the file name).
    '''
    # NOTE: 'html' is listed without a leading dot, so any ending in "html"
    # (for instance '.shtml' or '.xhtml') is accepted as well.
    if re.search(r'\.[a-z]{2,5}$', component) and not component.endswith(('.amp', '.asp', '.aspx', '.cfm', '.cgi', '.htm', 'html', '.jsp', '.php', '.pl')):
        return False
    return True

Expand Down
8 changes: 8 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def test_scrub():

def test_extension_filter():
    '''Check which URL endings extension_filter rejects and accepts.'''
    # script/asset extension: rejected
    assert extension_filter('http://www.example.org/test.js') is False
    # the string ends with a query value, not with an extension: accepted
    assert extension_filter('http://goodbasic.com/GirlInfo.aspx?Pseudo=MilfJanett') is True
    # '.amp' is on the list of accepted page extensions
    assert extension_filter('https://www.familienrecht-allgaeu.de/de/vermoegensrecht.amp') is True


def test_spam_filter():
Expand All @@ -48,6 +50,9 @@ def test_type_filter():
assert type_filter('http://my-videos.com/') is False
assert type_filter('http://www.example.org/index', strict=True) is False
assert type_filter('http://www.example.org/index.html', strict=True) is False
assert type_filter('http://concordia-hagen.de/impressum.html', strict=True) is False
assert type_filter('http://parkkralle.de/detail/index/sArticle/2704', strict=True) is True
assert type_filter('https://www.katholisch-in-duisdorf.de/kontakt/links/index.html', strict=True) is True


def test_validate():
Expand All @@ -74,6 +79,7 @@ def test_qelems():
assert normalize_url('http://test.net/foo.html?page=2&itemid=10&lang=en') == 'http://test.net/foo.html?itemid=10&lang=en&page=2'
with pytest.raises(ValueError):
assert normalize_url('http://test.net/foo.html?page=2&lang=en', with_language=True)
assert normalize_url('http://www.evolanguage.de/index.php?page=deutschkurse_fuer_aerzte&language=ES', with_language=True)


def test_urlcheck():
Expand All @@ -96,6 +102,8 @@ def test_urlcheck():
# recheck type and spam filters
assert check_url('http://example.org/code/oembed/') is None
assert check_url('http://cams.com/') is None
assert check_url('https://denkiterm.wordpress.com/impressum/', strict=True) is None
assert check_url('http://www.fischfutter-index.de/improvit-trocken-frostfutter-fur-fast-alle-fische/', strict=True) is not None


def test_cli():
Expand Down

0 comments on commit 4da1b15

Please sign in to comment.