From 05c6e201b057fba31a6351414661e30a2c4f97b2 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Wed, 31 May 2023 16:34:59 +0200
Subject: [PATCH] prepare release 0.9.3 (#38)

* prepare release 0.9.3
* correct history
* CLI: processes, not threads
* correct readme
* readme: change link to archive
* update changes
---
 HISTORY.md          | 10 ++++++++++
 README.rst          | 17 ++++++++++++++++-
 courlan/__init__.py |  2 +-
 courlan/cli.py      |  2 +-
 4 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 3ada96e1..6c43cebd 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,15 @@
 ## History / Changelog

+### 0.9.3
+
+- more efficient URL parsing (#33)
+- refined link extraction and link filters (#30, #36)
+- more efficient normalization (#32)
+- more efficient sampling strategy (#31, #35)
+- added meta function to clear LRU caches (#34)
+- added parallel option in command-line interface (#37, #39)
+- added ``get_unvisited_domains()`` method to ``UrlStore`` (#40)
+
 ### 0.9.2

diff --git a/README.rst b/README.rst
index d29bce84..b713184b 100644
--- a/README.rst
+++ b/README.rst
@@ -245,6 +245,18 @@ Basic URL validation only:
     (True, ParseResult(scheme='http', netloc='www.example.org', path='/', params='', query='', fragment=''))

+Troubleshooting
+~~~~~~~~~~~~~~~
+
+Courlan uses internal LRU caches to speed up URL parsing. They can be reset as follows:
+
+.. code-block:: python
+
+    >>> from courlan.meta import clear_caches
+    >>> clear_caches()
+
+
 UrlStore class
 ~~~~~~~~~~~~~~

@@ -264,6 +276,7 @@ The ``UrlStore`` class allows for storing and retrieving domain-classified URLs,
  - ``filter_unvisited_urls(urls)``: Take a list of URLs and return the currently unvisited ones.
  - ``find_known_urls(domain)``: Get all already known URLs for the given domain (e.g. "https://example.org").
  - ``find_unvisited_urls(domain)``: Get all unvisited URLs for the given domain.
+ - ``get_unvisited_domains()``: Return all domains that still contain unvisited URLs.
  - ``reset()``: Re-initialize the URL store.
 - Crawling and downloads
  - ``get_url(domain)``: Retrieve a single URL and consider it to be visited (with corresponding timestamp).
@@ -310,6 +323,8 @@ I/O:
   -d DISCARDEDFILE, --discardedfile DISCARDEDFILE
                         name of file to store discarded URLs (optional)
   -v, --verbose         increase output verbosity
+  -p PARALLEL, --parallel PARALLEL
+                        number of parallel processes (not used for sampling)

 Filtering:
   Configure URL filters

@@ -336,7 +351,7 @@ License
 *coURLan* is distributed under the `GNU General Public License v3.0 `_. If you wish to redistribute this library but feel bound by the license conditions please try interacting `at arm's length `_, `multi-licensing `_ with `compatible licenses `_, or `contacting me `_.
-See also `GPL and free software licensing: What's in it for business? `_
+See also `GPL and free software licensing: What's in it for business? `_
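Stepping back to the ``UrlStore`` hunk above: the new ``get_unvisited_domains()`` method slots into the existing crawling workflow. A minimal sketch, assuming a store populated via ``add_urls()`` (a ``UrlStore`` method documented elsewhere in the README, not shown in this patch):

.. code-block:: python

    from courlan import UrlStore

    store = UrlStore()
    # add_urls() buckets URLs by domain internally (assumed call)
    store.add_urls(["https://example.org/page1", "https://example.org/page2"])

    # new in 0.9.3: iterate over domains that still hold unvisited URLs
    for domain in store.get_unvisited_domains():
        url = store.get_url(domain)  # marks the retrieved URL as visited
        print(domain, url)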
diff --git a/courlan/__init__.py b/courlan/__init__.py
index 61407337..932e8f91 100644
--- a/courlan/__init__.py
+++ b/courlan/__init__.py
@@ -8,7 +8,7 @@
 __author__ = "Adrien Barbaresi"
 __license__ = "GNU GPL v3+"
 __copyright__ = "Copyright 2020-2023, Adrien Barbaresi"
-__version__ = "0.9.2"
+__version__ = "0.9.3"

 # imports
diff --git a/courlan/cli.py b/courlan/cli.py
index 41bd87f8..17385419 100644
--- a/courlan/cli.py
+++ b/courlan/cli.py
@@ -47,7 +47,7 @@ def parse_args(args: Any) -> Any:
     group1.add_argument(
         "-p",
         "--parallel",
-        help="number of parallel threads (not used for sampling)",
+        help="number of parallel processes (not used for sampling)",
         type=int,
         default=4,
     )
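The ``cli.py`` change above is a help-text fix: the ``--parallel`` option spawns worker processes rather than threads, which sidesteps the GIL for CPU-bound URL filtering. A rough library-side equivalent, assuming courlan's public ``check_url()`` function (not touched by this patch) and the standard-library process pool:

.. code-block:: python

    from concurrent.futures import ProcessPoolExecutor

    from courlan import check_url

    urls = [
        "https://example.org/page1",
        "http://example.org/page2?utm_source=newsletter",
    ]

    if __name__ == "__main__":
        # distribute URL checking over 4 processes, mirroring the CLI default
        with ProcessPoolExecutor(max_workers=4) as executor:
            for result in executor.map(check_url, urls):
                # check_url() returns a (url, domain) tuple, or None for rejects
                if result is not None:
                    print(result)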