From 1cfb7db54598c37cefe4e5ddcc7aa36ba3035f33 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 1 Feb 2024 15:51:15 +0100 Subject: [PATCH] prepare version 1 (#84) --- HISTORY.md | 8 ++++++++ README.rst | 10 ++++++++++ courlan/__init__.py | 4 ++-- courlan/core.py | 1 + 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 5df7b83..7b10342 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,14 @@ ## History / Changelog +### 1.0.0 + +- license change from GPLv3+ to Apache 2.0 (#81) +- UrlStore: `write()` method and `load_store()` function added (#83) +- add parameter `trailing_slash` to keep or discard slashes at the end of URLs (#52) +- maintenance: fix whitespace in `clean_url()` (#77), simplify code (#79) + + ### 0.9.5 - IRI to URI normalization: encode path, query and fragments (#58, #60) diff --git a/README.rst b/README.rst index 82817d4..aa6a815 100644 --- a/README.rst +++ b/README.rst @@ -114,6 +114,12 @@ All useful operations chained in ``check_url(url)``: # check for redirects (HEAD request) >>> url, domain_name = check_url(my_url, with_redirects=True) + # include navigation pages instead of discarding them + >>> check_url('http://www.example.org/page/10/', with_nav=True) + + # remove trailing slash + >>> check_url('https://github.com/adbar/courlan/', trailing_slash=False) + Language-aware heuristics, notably internationalization in URLs, are available in ``lang_filter(url, language)``: @@ -311,6 +317,10 @@ The ``UrlStore`` class allow for storing and retrieving domain-classified URLs, - ``download_threshold_reached(threshold)``: Find out if the download limit (in seconds) has been reached for one of the websites in store. - ``unvisited_websites_number()``: Return the number of websites for which there are still URLs to visit. - ``is_exhausted_domain(domain)``: Tell if all known URLs for the website have been visited. +- Persistence + - ``write(filename)``: Save the store to disk. 
+ - ``load_store(filename)``: Read a UrlStore from disk (separate function, not class method). + Optional settings: - ``compressed=True``: activate compression of URLs and rules diff --git a/courlan/__init__.py b/courlan/__init__.py index 2c2a426..f9f1f3d 100644 --- a/courlan/__init__.py +++ b/courlan/__init__.py @@ -6,8 +6,8 @@ __title__ = "courlan" __author__ = "Adrien Barbaresi" __license__ = "Apache-2.0" -__copyright__ = "Copyright 2020-2023, Adrien Barbaresi" -__version__ = "0.9.5" +__copyright__ = "Copyright 2020-2024, Adrien Barbaresi" +__version__ = "1.0.0" # imports diff --git a/courlan/core.py b/courlan/core.py index 025d2a6..a06e3eb 100644 --- a/courlan/core.py +++ b/courlan/core.py @@ -54,6 +54,7 @@ def check_url( with_redirects: set to True for redirection test (per HTTP HEAD request) language: set target language (ISO 639-1 codes) with_nav: set to True to include navigation pages instead of discarding them + trailing_slash: set to False to trim trailing slashes Returns: A tuple consisting of canonical URL and extracted domain