
Commit

functions revamped
adbar committed Sep 1, 2020
1 parent e258885 commit 45c68a3
Showing 6 changed files with 186 additions and 120 deletions.
127 changes: 87 additions & 40 deletions README.rst
@@ -23,8 +23,10 @@ coURLan: clean, filter and sample URLs
Features
--------

- Cleaning and filtering targeting non-spam HTML pages with primarily text
- URL validation
Separate `the wheat from the chaff <https://en.wiktionary.org/wiki/separate_the_wheat_from_the_chaff>`_ and optimize crawls by focusing on non-spam HTML pages containing primarily text.

- URL validation and (basic) normalization
- Filters targeting spam and unsuitable content-types
- Sampling by domain name
- Command-line interface (CLI) and Python tool

@@ -57,39 +59,9 @@ This Python package is tested on Linux, macOS and Windows systems, it is compati
Usage
-----

Current focus is on German, for more see ``settings.py``. This can be overriden by `cloning the repository <https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/cloning-a-repository-from-github>`_ and `recompiling the package locally <https://packaging.python.org/tutorials/installing-packages/#installing-from-a-local-src-tree>`_.


Command-line
~~~~~~~~~~~~

.. code-block:: bash
$ courlan --inputfile url-list.txt --outputfile cleaned-urls.txt
$ courlan --help
usage: courlan [-h] -i INPUTFILE -o OUTPUTFILE [-v] [-l] [-r] [-s]
[--samplesize SAMPLESIZE] [--exclude-max EXCLUDE_MAX]
[--exclude-min EXCLUDE_MIN]

optional arguments:
-h, --help show this help message and exit
-i INPUTFILE, --inputfile INPUTFILE
name of input file
-o OUTPUTFILE, --outputfile OUTPUTFILE
name of input file
-v, --verbose increase output verbosity
-l, --language use language filter
-r, --redirects check redirects
-s, --sample use sampling
--samplesize SAMPLESIZE
size of sample per domain
--exclude-max EXCLUDE_MAX
exclude domains with more than n URLs
--exclude-min EXCLUDE_MIN
exclude domains with less than n URLs
``courlan`` is designed to work best on English, German and the most frequent European languages.

The current logic of detailed/strict URL filtering targets German; for more, see ``settings.py``. This can be overridden by `cloning the repository <https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/cloning-a-repository-from-github>`_ and `recompiling the package locally <https://packaging.python.org/tutorials/installing-packages/#installing-from-a-local-src-tree>`_, as sketched below.
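
A possible override workflow, assuming ``git`` and ``pip`` are available locally and that the file to adapt is ``courlan/settings.py``:

.. code-block:: bash
$ git clone https://github.com/adbar/courlan.git
$ cd courlan
# adapt the settings (e.g. the target language), then reinstall from the local source tree
$ pip install --upgrade .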


Python
@@ -100,25 +72,52 @@ All operations chained:
.. code-block:: python
>>> from courlan.core import check_url
>>> url, domain_name = check_url(my_url)
>>> check_url('https://github.com/adbar/courlan') # returns url and domain name
('https://github.com/adbar/courlan', 'github.com')
# noisy query parameters are removed
>>> check_url('https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org', strict=True)
('https://httpbin.org/redirect-to', 'httpbin.org')
# Check for redirects (HEAD request)
>>> url, domain_name = check_url(my_url, with_redirects=True)
# optional language filter targeting webpages in German: with_language=True (default: False)
Cleaning only:
Helper function, scrub and normalize:

.. code-block:: python
>>> from courlan.clean import clean_url
>>> my_url = clean_url(my_url)
>>> clean_url('HTTPS://WWW.DWDS.DE:80/')
'https://www.dwds.de'
URL validation:
Basic scrubbing only:

.. code-block:: python
>>> from courlan.clean import scrub_url
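# hypothetical input; the expected output follows the scrub_url logic in courlan/clean.py below
>>> scrub_url('  https://www.dwds.de/ ')
'https://www.dwds.de'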
Basic normalization only:

.. code-block:: python
>>> from urllib.parse import urlparse
>>> from courlan.clean import normalize_url
>>> my_url = normalize_url(urlparse(my_url))
# passing URL strings directly also works
>>> my_url = normalize_url(my_url)
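# illustrative call with the default arguments: scheme and host are lowercased,
# default ports and fragments are removed
>>> normalize_url('HTTP://WWW.Example.ORG:80/test?a=1#frag')
'http://www.example.org/test?a=1'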
Basic URL validation only:

.. code-block:: python
>>> from courlan.filters import validate_url
>>> result, parsed_url = validate_url(my_url)
>>> validate_url('http://1234')
(False, None)
>>> validate_url('http://www.example.org/')
(True, ParseResult(scheme='http', netloc='www.example.org', path='/', params='', query='', fragment=''))
Sampling by domain name:
@@ -127,7 +126,55 @@ Sampling by domain name:
>>> from courlan.core import sample_urls
>>> my_sample = sample_urls(my_urls, 100)
# optional: exclude_min=None, exclude_max=None, verbose=False
# optional: exclude_min=None, exclude_max=None, strict=False, verbose=False
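# hypothetical input list, e.g. many URLs from a single domain
>>> my_urls = ['https://www.dwds.de/?p=' + str(i) for i in range(1000)]
>>> my_sample = list(sample_urls(my_urls, 100))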
Command-line
~~~~~~~~~~~~

.. code-block:: bash
$ courlan --inputfile url-list.txt --outputfile cleaned-urls.txt
$ courlan --help
usage: courlan [-h] -i INPUTFILE -o OUTPUTFILE [-d DISCARDEDFILE] [-v]
[--strict] [-l] [-r] [--sample] [--samplesize SAMPLESIZE]
[--exclude-max EXCLUDE_MAX] [--exclude-min EXCLUDE_MIN]

optional arguments:
-h, --help show this help message and exit

I/O:
Manage input and output

-i INPUTFILE, --inputfile INPUTFILE
name of input file (required)
-o OUTPUTFILE, --outputfile OUTPUTFILE
name of output file (required)
-d DISCARDEDFILE, --discardedfile DISCARDEDFILE
name of file to store discarded URLs (optional)
-v, --verbose increase output verbosity

Filtering:
Configure URL filters

--strict perform more restrictive tests
-l, --language use language filter
-r, --redirects check redirects

Sampling:
Use sampling by host, configure sample size

--sample use sampling
--samplesize SAMPLESIZE
size of sample per domain
--exclude-max EXCLUDE_MAX
exclude domains with more than n URLs
--exclude-min EXCLUDE_MIN
exclude domains with less than n URLs
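
For instance, a stricter run that also stores discarded URLs in a separate file (illustrative file names):

.. code-block:: bash
$ courlan --inputfile raw-urls.txt --outputfile accepted-urls.txt --discardedfile discarded-urls.txt --strict --language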



Additional scripts
24 changes: 15 additions & 9 deletions courlan/clean.py
@@ -15,14 +15,19 @@


def clean_url(url):
    '''Helper function: chained scrubbing and normalization'''
    return normalize_url(scrub_url(url))


def scrub_url(url):
    '''Strip unnecessary parts and make sure only one URL is considered'''
    # trim
    url = url.strip()
    # clean the input string
    url = re.sub(r'[ \t]+', '', url)
    # trailing slashes
    url = url.rstrip('/')
    # CDATA # <![CDATA[http://www.urbanlife.de/item/260-bmw-i8-hybrid-revolution-unter-den-sportwagen.html]]>
    # <![CDATA[http://www.urbanlife.de/item/260-bmw-i8-hybrid-revolution-unter-den-sportwagen.html]]>
    if url.startswith('<![CDATA['): # re.match(r'<!\[CDATA\[', url):
        url = url.replace('<![CDATA[', '') # url = re.sub(r'^<!\[CDATA\[', '', url)
        url = url.replace(']]>', '') # url = re.sub(r'\]\]>$', '', url)
@@ -38,23 +43,24 @@ def clean_url(url):
        url = match.group(1)
        logging.debug('taking url: %s', url)
    # lower
    url = url.lower()
    # url = url.lower()
    return url


def clean_query(parsed_url, with_language=False):
def clean_query(parsed_url, strict=False, with_language=False):
    '''Strip unwanted query elements'''
    if len(parsed_url.query) > 0:
        qdict = parse_qs(parsed_url.query)
        newqdict = OrderedDict()
        for qelem in sorted(qdict.keys()):
            teststr = qelem.lower()
            # control param
            if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
            if strict is True and \
                teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
                continue
            # control language
            if with_language is True and teststr in CONTROL_PARAMS and \
                teststr.lower() not in TARGET_LANG:
            if with_language is True and \
                teststr in CONTROL_PARAMS and teststr not in TARGET_LANG:
                logging.debug('bad lang: %s %s', qelem, qdict[qelem])
                raise ValueError
            # insert
@@ -64,20 +70,20 @@ def clean_query(parsed_url, with_language=False):
    return parsed_url


def normalize_url(parsed_url, with_language=False):
def normalize_url(parsed_url, strict=False, with_language=False):
    '''Takes a URL string or a parsed URL and returns a (basically) normalized URL string'''
    if not isinstance(parsed_url, ParseResult):
        parsed_url = urlparse(parsed_url)
    # port
    if parsed_url.port is not None and parsed_url.port in (80, 443):
        parsed_url = parsed_url._replace(port=None)
    parsed_url = parsed_url._replace(netloc=re.sub(r'(?<=\w):(?:80|443)', '', parsed_url.netloc))
    # lowercase + remove fragments
    parsed_url = parsed_url._replace(
        scheme=parsed_url.scheme.lower(),
        netloc=parsed_url.netloc.lower(),
        fragment=''
    )
    # strip unwanted query elements
    parsed_url = clean_query(parsed_url, with_language)
    parsed_url = clean_query(parsed_url, strict, with_language)
    # rebuild
    return parsed_url.geturl()
1 change: 1 addition & 0 deletions courlan/cli.py
@@ -77,5 +77,6 @@ def main():
    for url in sample_urls(urllist, args.samplesize, exclude_min=args.exclude_min, exclude_max=args.exclude_max, strict=args.strict, verbose=args.verbose):
        outputfh.write(url + '\n')


if __name__ == '__main__':
    main()
35 changes: 15 additions & 20 deletions courlan/core.py
@@ -15,8 +15,8 @@

import tldextract

from .clean import clean_url, normalize_url
from .filters import extensionfilter, spamfilter, typefilter, validate_url
from .clean import normalize_url, scrub_url
from .filters import basic_filter, extension_filter, spam_filter, type_filter, validate_url
from .network import redirection_test
from .settings import BLACKLIST

@@ -54,17 +54,23 @@ def check_url(url, strict=False, with_redirects=False, with_language=False):
    # use standard parsing library, validate and strip fragments, then normalize
    try:
        # length test
        if not url.startswith('http') or len(url) >= 500 or len(url) < 10:
        if basic_filter(url) is False:
            raise ValueError

        # clean
        url = clean_url(url)
        url = scrub_url(url)

        # get potential redirect
        if with_redirects is True:
            url = redirection_test(url)
            if url is None:
                raise ValueError

        # spam
        if spamfilter(url) is False:
        if spam_filter(url) is False:
            raise ValueError
        # structural elements
        if typefilter(url, strict) is False:
        if type_filter(url, strict) is False:
            raise ValueError

        # split and validate
@@ -73,14 +79,14 @@
            raise ValueError

        # content filter based on extensions
        if extensionfilter(parsed_url.path) is False:
        if extension_filter(parsed_url.path) is False:
            raise ValueError

        # normalize
        url = normalize_url(parsed_url, with_language)
        url = normalize_url(parsed_url, strict, with_language)

    # handle exceptions
    except (AttributeError, UnicodeError, ValueError):
    except (AttributeError, ValueError, UnicodeError):
        # LOGGER.debug('discarded URL: %s', url)
        return None

@@ -89,17 +95,6 @@ def check_url(url, strict=False, with_redirects=False, with_language=False):
    if domain is None:
        return None

    ## URL probably OK
    # get potential redirect
    if with_redirects is True:
        url2 = redirection_test(url)
        if url2 is not None:
            domain2 = extract_domain(url)
            if domain2 is not None and domain2 != domain:
                return (url2, domain2)
        else:
            return None

    return (url, domain)


48 changes: 27 additions & 21 deletions courlan/filters.py
@@ -17,8 +17,34 @@
ADULT_FILTER = re.compile(r'\b(?:adult|amateur|cams?|gangbang|incest|sexyeroti[ck]|sexcam|bild\-?kontakte)\b|\b(?:arsch|fick|porno?)|(?:cash|swinger)\b', re.IGNORECASE)


def basic_filter(url):
    '''Filter URLs based on basic formal characteristics'''
    if not url.startswith('http') or len(url) >= 500 or len(url) < 10:
        return False
    return True


def extension_filter(component):
    '''Filter based on file extension'''
    if re.search(r'\.[a-z]{2,5}$', component) and not component.endswith(('.asp', '.cfm', '.cgi', '.htm', 'html', '.jsp', '.php', '.pl')):
        return False
    return True


def spam_filter(url):
    '''Try to filter out spam and adult websites'''
    # TODO: to improve!
    #for exp in (''):
    #    if exp in url:
    #        return False
    if ADULT_FILTER.search(url):
    # or re.search(r'\b(?:sex)\b', url): # live|xxx|sex|ass|orgasm|cams|
        return False
    # default
    return True

def typefilter(url, strict=False):

def type_filter(url, strict=False):
    '''Make sure the target URL is from a suitable type (HTML page with primarily text)'''
    # directory
    #if url.endswith('/'):
@@ -50,26 +76,6 @@ def typefilter(url, strict=False):
    return True


def extensionfilter(component):
    '''Filter based on file extension'''
    if re.search(r'\.[a-z]{2,5}$', component) and not component.endswith(('html', '.htm', '.asp', '.php', '.jsp', '.pl', '.cgi', '.cfm')):
        return False
    return True


def spamfilter(url):
    '''Try to filter out spam and adult websites'''
    # TODO: to improve!
    #for exp in (''):
    #    if exp in url:
    #        return False
    if ADULT_FILTER.search(url):
    # or re.search(r'\b(?:sex)\b', url): # live|xxx|sex|ass|orgasm|cams|
        return False
    # default
    return True


def validate_url(url):
    '''Parse and validate the input'''
    try:
