From 194a08ce9d0dab3180e3fade8fac340ceb06c2c2 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Tue, 1 Sep 2020 19:19:05 +0200
Subject: [PATCH] Prepare release of version 0.2.0

---
 HISTORY.md          |  7 +++++++
 README.rst          | 12 ++++++++----
 setup.py            | 14 +++++++-------
 tests/unit_tests.py |  1 +
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 05d0554..bd4b7e9 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,13 @@
 ## History / Changelog
 
 
+### 0.2.0
+
+- Cleaner and more efficient filtering
+- Helper functions to scrub, clean and normalize
+- Removed two dependencies by making more extensive use of urllib.parse
+
+
 ### 0.1.0
 
 - Cleaning and filtering targeting non-spam HTML pages with primarily text
diff --git a/README.rst b/README.rst
index 25e1b00..921ea45 100644
--- a/README.rst
+++ b/README.rst
@@ -1,5 +1,5 @@
-coURLan: clean, filter and sample URLs
-======================================
+coURLan: Clean, filter, normalize, and sample URLs
+==================================================
 
 
 .. image:: https://img.shields.io/pypi/v/courlan.svg
@@ -72,9 +72,10 @@ All operations chained:
 .. code-block:: python
 
     >>> from courlan.core import check_url
-    >>> check_url('https://github.com/adbar/courlan') # returns url and domain name
+    # returns url and domain name
+    >>> check_url('https://github.com/adbar/courlan')
     ('https://github.com/adbar/courlan', 'github.com')
-    # noisy query parameters are removed
+    # noisy query parameters can be removed
     >>> check_url('https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org', strict=True)
     ('https://httpbin.org/redirect-to', 'httpbin.org')
     # Check for redirects (HEAD request)
@@ -107,6 +108,9 @@ Basic normalization only:
     >>> my_url = normalize_url(urlparse(my_url))
     # passing URL strings directly also works
     >>> my_url = normalize_url(my_url)
+    # remove unnecessary components and re-order query elements
+    >>> normalize_url('http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment', strict=True)
+    'http://test.net/foo.html?page=2&post=abc'
 
 
 Basic URL validation only:
diff --git a/setup.py b/setup.py
index bf37be9..1a4aa2f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
 """
-URL manipulation tools
+URL filter and manipulation tools
 http://github.com/adbar/courlan
 """
 
@@ -20,13 +20,12 @@ def readme():
 
 setup(
     name='courlan',
-    version='0.1.0',
-    description='Clean, filter and sample URLs',
+    version='0.2.0',
+    description='Clean, filter, normalize, and sample URLs',
     long_description=readme(),
     classifiers=[
         # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers
-        'Development Status :: 2 - Pre-Alpha',
-        #'Development Status :: 3 - Alpha',
+        'Development Status :: 3 - Alpha',
         #'Development Status :: 4 - Beta',
         #'Development Status :: 5 - Production/Stable',
         #'Development Status :: 6 - Mature',
@@ -48,9 +47,10 @@ def readme():
         'Programming Language :: Python :: 3.8',
         'Topic :: Internet :: WWW/HTTP',
         'Topic :: Scientific/Engineering :: Information Analysis',
+        'Topic :: Text Processing :: Filters',
     ],
-    keywords=['urls', 'url-parsing', 'url-manipulation', 'preprocessing', 'validation'],
-    url='http://github.com/adbar/urltools',
+    keywords=['urls', 'url-parsing', 'url-manipulation', 'preprocessing', 'validation', 'webcrawling'],
+    url='http://github.com/adbar/courlan',
     author='Adrien Barbaresi',
     author_email='barbaresi@bbaw.de',
     license='GPLv3+',
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index a5c5965..50417c7 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -125,3 +125,4 @@ def test_examples():
     assert clean_url('HTTPS://WWW.DWDS.DE:80/') == 'https://www.dwds.de'
     assert validate_url('http://1234') == (False, None)
     assert validate_url('http://www.example.org/')[0] is True
+    assert normalize_url('http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment', strict=True) == 'http://test.net/foo.html?page=2&post=abc'