From 4da1b159ee4ff3b327414a429db128df8b558b18 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Wed, 2 Sep 2020 14:00:46 +0200 Subject: [PATCH] reviewed filter sensitivity --- courlan/filters.py | 6 +++--- tests/unit_tests.py | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/courlan/filters.py b/courlan/filters.py index 0f7ac19..4755761 100644 --- a/courlan/filters.py +++ b/courlan/filters.py @@ -11,9 +11,9 @@ from urllib.parse import urlparse -WORDPRESS_FILTER = re.compile(r'/(?:tags?|schlagwort|category|cat|kategorie|kat|auth?or|page|seite|user|search|gallery|gallerie|labels|archives|uploads|modules|attachment)/', re.IGNORECASE) +WORDPRESS_FILTER = re.compile(r'/(?:tags?|schlagwort|category|cat|kategorie|kat|auth?or|page|seite|user|search|gallery|gall?erie|labels|archives|uploads|modules|attachment)/', re.IGNORECASE) PARAM_FILTER = re.compile(r'\.(atom|json|css|xml|js|jpg|jpeg|png|gif|tiff|pdf|ogg|mp3|m4a|aac|avi|mp4|mov|webm|flv|ico|pls|zip|tar|gz|iso|swf)\b', re.IGNORECASE) # , re.IGNORECASE (?=[&?]) -PATH_FILTER = re.compile(r'(impressum|index)(\.html)?', re.IGNORECASE) +PATH_FILTER = re.compile(r'\.[a-z]{2,5}/(impressum|index)(\.html?|\.php)?$', re.IGNORECASE) ADULT_FILTER = re.compile(r'\b(?:adult|amateur|cams?|gangbang|incest|sexyeroti[ck]|sexcam|bild\-?kontakte)\b|\b(?:arsch|fick|porno?)|(?:cash|swinger)\b', re.IGNORECASE) @@ -26,7 +26,7 @@ def basic_filter(url): def extension_filter(component): '''Filter based on file extension''' - if re.search(r'\.[a-z]{2,5}$', component) and not component.endswith(('.asp', '.cfm', '.cgi', '.htm', 'html', '.jsp', '.php', '.pl')): + if re.search(r'\.[a-z]{2,5}$', component) and not component.endswith(('.amp', '.asp', '.aspx', '.cfm', '.cgi', '.htm', 'html', '.jsp', '.php', '.pl')): return False return True diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 50417c7..7ba7cfb 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -32,6 +32,8 @@ def test_scrub(): def test_extension_filter(): assert extension_filter('http://www.example.org/test.js') is False + assert extension_filter('http://goodbasic.com/GirlInfo.aspx?Pseudo=MilfJanett') is True + assert extension_filter('https://www.familienrecht-allgaeu.de/de/vermoegensrecht.amp') is True def test_spam_filter(): @@ -48,6 +50,9 @@ def test_type_filter(): assert type_filter('http://my-videos.com/') is False assert type_filter('http://www.example.org/index', strict=True) is False assert type_filter('http://www.example.org/index.html', strict=True) is False + assert type_filter('http://concordia-hagen.de/impressum.html', strict=True) is False + assert type_filter('http://parkkralle.de/detail/index/sArticle/2704', strict=True) is True + assert type_filter('https://www.katholisch-in-duisdorf.de/kontakt/links/index.html', strict=True) is True def test_validate(): @@ -74,6 +79,7 @@ def test_qelems(): assert normalize_url('http://test.net/foo.html?page=2&itemid=10&lang=en') == 'http://test.net/foo.html?itemid=10&lang=en&page=2' with pytest.raises(ValueError): assert normalize_url('http://test.net/foo.html?page=2&lang=en', with_language=True) + assert normalize_url('http://www.evolanguage.de/index.php?page=deutschkurse_fuer_aerzte&language=ES', with_language=True) def test_urlcheck(): @@ -96,6 +102,8 @@ def test_urlcheck(): # recheck type and spam filters assert check_url('http://example.org/code/oembed/') is None assert check_url('http://cams.com/') is None + assert check_url('https://denkiterm.wordpress.com/impressum/', strict=True) is None + assert check_url('http://www.fischfutter-index.de/improvit-trocken-frostfutter-fur-fast-alle-fische/', strict=True) is not None def test_cli():