From 4da1b159ee4ff3b327414a429db128df8b558b18 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Wed, 2 Sep 2020 14:00:46 +0200
Subject: [PATCH] Review filter sensitivity: anchor PATH_FILTER, allow .amp/.aspx

---
 courlan/filters.py  | 6 +++---
 tests/unit_tests.py | 8 ++++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/courlan/filters.py b/courlan/filters.py
index 0f7ac19..4755761 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -11,9 +11,9 @@
 from urllib.parse import urlparse
 
 
-WORDPRESS_FILTER = re.compile(r'/(?:tags?|schlagwort|category|cat|kategorie|kat|auth?or|page|seite|user|search|gallery|gallerie|labels|archives|uploads|modules|attachment)/', re.IGNORECASE)
+WORDPRESS_FILTER = re.compile(r'/(?:tags?|schlagwort|category|cat|kategorie|kat|auth?or|page|seite|user|search|gallery|gall?erie|labels|archives|uploads|modules|attachment)/', re.IGNORECASE)
 PARAM_FILTER = re.compile(r'\.(atom|json|css|xml|js|jpg|jpeg|png|gif|tiff|pdf|ogg|mp3|m4a|aac|avi|mp4|mov|webm|flv|ico|pls|zip|tar|gz|iso|swf)\b', re.IGNORECASE)  # , re.IGNORECASE (?=[&?])
-PATH_FILTER = re.compile(r'(impressum|index)(\.html)?', re.IGNORECASE)
+PATH_FILTER = re.compile(r'\.[a-z]{2,5}/(impressum|index)(\.html?|\.php)?$', re.IGNORECASE)
 ADULT_FILTER = re.compile(r'\b(?:adult|amateur|cams?|gangbang|incest|sexyeroti[ck]|sexcam|bild\-?kontakte)\b|\b(?:arsch|fick|porno?)|(?:cash|swinger)\b', re.IGNORECASE)
 
 
@@ -26,7 +26,7 @@ def basic_filter(url):
 
 def extension_filter(component):
     '''Filter based on file extension'''
-    if re.search(r'\.[a-z]{2,5}$', component) and not component.endswith(('.asp', '.cfm', '.cgi', '.htm', 'html', '.jsp', '.php', '.pl')):
+    if re.search(r'\.[a-z]{2,5}$', component) and not component.endswith(('.amp', '.asp', '.aspx', '.cfm', '.cgi', '.htm', 'html', '.jsp', '.php', '.pl')):
         return False
     return True
 
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 50417c7..7ba7cfb 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -32,6 +32,8 @@ def test_scrub():
 
 def test_extension_filter():
     assert extension_filter('http://www.example.org/test.js') is False
+    assert extension_filter('http://goodbasic.com/GirlInfo.aspx?Pseudo=MilfJanett') is True
+    assert extension_filter('https://www.familienrecht-allgaeu.de/de/vermoegensrecht.amp') is True
 
 
 def test_spam_filter():
@@ -48,6 +50,9 @@ def test_type_filter():
     assert type_filter('http://my-videos.com/') is False
     assert type_filter('http://www.example.org/index', strict=True) is False
     assert type_filter('http://www.example.org/index.html', strict=True) is False
+    assert type_filter('http://concordia-hagen.de/impressum.html', strict=True) is False
+    assert type_filter('http://parkkralle.de/detail/index/sArticle/2704', strict=True) is True
+    assert type_filter('https://www.katholisch-in-duisdorf.de/kontakt/links/index.html', strict=True) is True
 
 
 def test_validate():
@@ -74,6 +79,7 @@ def test_qelems():
     assert normalize_url('http://test.net/foo.html?page=2&itemid=10&lang=en') == 'http://test.net/foo.html?itemid=10&lang=en&page=2'
     with pytest.raises(ValueError):
         assert normalize_url('http://test.net/foo.html?page=2&lang=en', with_language=True)
+        assert normalize_url('http://www.evolanguage.de/index.php?page=deutschkurse_fuer_aerzte&amp;language=ES', with_language=True)
 
 
 def test_urlcheck():
@@ -96,6 +102,8 @@ def test_urlcheck():
     # recheck type and spam filters
     assert check_url('http://example.org/code/oembed/') is None
     assert check_url('http://cams.com/') is None
+    assert check_url('https://denkiterm.wordpress.com/impressum/', strict=True) is None
+    assert check_url('http://www.fischfutter-index.de/improvit-trocken-frostfutter-fur-fast-alle-fische/', strict=True) is not None
 
 
 def test_cli():