From 194a08ce9d0dab3180e3fade8fac340ceb06c2c2 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 1 Sep 2020 19:19:05 +0200 Subject: [PATCH] prepare version release --- HISTORY.md | 7 +++++++ README.rst | 12 ++++++++---- setup.py | 14 +++++++------- tests/unit_tests.py | 1 + 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 05d0554..bd4b7e9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,13 @@ ## History / Changelog +### 0.2.0 + +- Cleaner and more efficient filtering +- Helper functions to scrub, clean and normalize +- Removed two dependencies with more extensive usage of urllib.parse + + ### 0.1.0 - Cleaning and filtering targeting non-spam HTML pages with primarily text diff --git a/README.rst b/README.rst index 25e1b00..921ea45 100644 --- a/README.rst +++ b/README.rst @@ -1,5 +1,5 @@ -coURLan: clean, filter and sample URLs -====================================== +coURLan: Clean, filter, normalize, and sample URLs +================================================== .. image:: https://img.shields.io/pypi/v/courlan.svg @@ -72,9 +72,10 @@ All operations chained: .. code-block:: python >>> from courlan.core import check_url - >>> check_url('https://github.com/adbar/courlan') # returns url and domain name + # returns url and domain name + >>> check_url('https://github.com/adbar/courlan') ('https://github.com/adbar/courlan', 'github.com') - # noisy query parameters are removed + # noisy query parameters can be removed >>> check_url('https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org', strict=True) ('https://httpbin.org/redirect-to', 'httpbin.org') # Check for redirects (HEAD request) @@ -107,6 +108,9 @@ Basic normalization only: >>> my_url = normalize_url(urlparse(my_url)) # passing URL strings directly also works >>> my_url = normalize_url(my_url) + # remove unnecessary components and re-order query elements + >>> normalize_url('http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment', strict=True) + 'http://test.net/foo.html?page=2&post=abc' Basic URL validation only: diff --git a/setup.py b/setup.py index bf37be9..1a4aa2f 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ """ -URL manipulation tools +URL filter and manipulation tools http://github.com/adbar/courlan """ @@ -20,13 +20,12 @@ def readme(): setup( name='courlan', - version='0.1.0', - description='Clean, filter and sample URLs', + version='0.2.0', + description='Clean, filter, normalize, and sample URLs', long_description=readme(), classifiers=[ # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers - 'Development Status :: 2 - Pre-Alpha', - #'Development Status :: 3 - Alpha', + 'Development Status :: 3 - Alpha', #'Development Status :: 4 - Beta', #'Development Status :: 5 - Production/Stable', #'Development Status :: 6 - Mature', @@ -48,9 +47,10 @@ def readme(): 'Programming Language :: Python :: 3.8', 'Topic :: Internet :: WWW/HTTP', 'Topic :: Scientific/Engineering :: Information Analysis', + 'Topic :: Text Processing :: Filters', ], - keywords=['urls', 'url-parsing', 'url-manipulation', 'preprocessing', 'validation'], - url='http://github.com/adbar/urltools', + keywords=['urls', 'url-parsing', 'url-manipulation', 'preprocessing', 'validation', 'webcrawling'], + url='http://github.com/adbar/courlan', author='Adrien Barbaresi', author_email='barbaresi@bbaw.de', license='GPLv3+', diff --git a/tests/unit_tests.py b/tests/unit_tests.py index a5c5965..50417c7 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -125,3 +125,4 @@ def test_examples(): assert clean_url('HTTPS://WWW.DWDS.DE:80/') == 'https://www.dwds.de' assert validate_url('http://1234') == (False, None) assert validate_url('http://www.example.org/')[0] is True + assert normalize_url('http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment', strict=True) == 'http://test.net/foo.html?page=2&post=abc'