From 05c6e201b057fba31a6351414661e30a2c4f97b2 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Wed, 31 May 2023 16:34:59 +0200
Subject: [PATCH] prepare release 0.9.3 (#38)

* prepare release 0.9.3
* correct history
* CLI: processes, not threads
* correct readme
* readme: change link to archive
* update changes
---
 HISTORY.md          | 10 ++++++++++
 README.rst          | 17 ++++++++++++++++-
 courlan/__init__.py |  2 +-
 courlan/cli.py      |  2 +-
 4 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 3ada96e1..6c43cebd 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,15 @@
 ## History / Changelog

+### 0.9.3
+
+- more efficient URL parsing (#33)
+- refined link extraction and link filters (#30, #36)
+- more efficient normalization (#32)
+- more efficient sampling strategy (#31, #35)
+- added meta function to clear LRU caches (#34)
+- added parallel option in command-line interface (#37, #39)
+- added ``get_unvisited_domains()`` method to ``UrlStore`` (#40)
+
 ### 0.9.2

diff --git a/README.rst b/README.rst
index d29bce84..b713184b 100644
--- a/README.rst
+++ b/README.rst
@@ -245,6 +245,18 @@ Basic URL validation only:
     (True, ParseResult(scheme='http', netloc='www.example.org', path='/', params='', query='', fragment=''))

+Troubleshooting
+~~~~~~~~~~~~~~~
+
+Courlan uses internal LRU caches to speed up URL parsing. They can be reset as follows:
+
+.. code-block:: python
+
+    >>> from courlan.meta import clear_caches
+    >>> clear_caches()
+
+
 UrlStore class
 ~~~~~~~~~~~~~~

@@ -264,6 +276,7 @@ The ``UrlStore`` class allows for storing and retrieving domain-classified URLs,
  - ``filter_unvisited_urls(urls)``: Take a list of URLs and return the currently unvisited ones.
  - ``find_known_urls(domain)``: Get all already known URLs for the given domain (e.g. "https://example.org").
  - ``find_unvisited_urls(domain)``: Get all unvisited URLs for the given domain.
+ - ``get_unvisited_domains()``: Return all domains that still contain unvisited URLs.
  - ``reset()``: Re-initialize the URL store.
 - Crawling and downloads
  - ``get_url(domain)``: Retrieve a single URL and consider it to be visited (with corresponding timestamp).
@@ -310,6 +323,8 @@ I/O:
   -d DISCARDEDFILE, --discardedfile DISCARDEDFILE
                         name of file to store discarded URLs (optional)
   -v, --verbose         increase output verbosity
+  -p PARALLEL, --parallel PARALLEL
+                        number of parallel processes (not used for sampling)

 Filtering:
   Configure URL filters

@@ -336,7 +351,7 @@ License
 *coURLan* is distributed under the `GNU General Public License v3.0 `_. If you wish to redistribute this library but feel bound by the license conditions please try interacting `at arm's length `_, `multi-licensing `_ with `compatible licenses `_, or `contacting me `_.
-See also `GPL and free software licensing: What's in it for business? `_
+See also `GPL and free software licensing: What's in it for business? `_
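Stepping back to the ``UrlStore`` hunk above: the new ``get_unvisited_domains()`` method slots into the existing crawling workflow. A minimal sketch, assuming a store populated via ``add_urls()`` (a ``UrlStore`` method documented elsewhere in the README, not shown in this patch):

.. code-block:: python

    from courlan import UrlStore

    store = UrlStore()
    # add_urls() buckets URLs by domain internally (assumed call)
    store.add_urls(["https://example.org/page1", "https://example.org/page2"])

    # new in 0.9.3: iterate over domains that still hold unvisited URLs
    for domain in store.get_unvisited_domains():
        url = store.get_url(domain)  # marks the retrieved URL as visited
        print(domain, url)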
diff --git a/courlan/__init__.py b/courlan/__init__.py
index 61407337..932e8f91 100644
--- a/courlan/__init__.py
+++ b/courlan/__init__.py
@@ -8,7 +8,7 @@
 __author__ = "Adrien Barbaresi"
 __license__ = "GNU GPL v3+"
 __copyright__ = "Copyright 2020-2023, Adrien Barbaresi"
-__version__ = "0.9.2"
+__version__ = "0.9.3"

 # imports
diff --git a/courlan/cli.py b/courlan/cli.py
index 41bd87f8..17385419 100644
--- a/courlan/cli.py
+++ b/courlan/cli.py
@@ -47,7 +47,7 @@ def parse_args(args: Any) -> Any:
     group1.add_argument(
         "-p",
         "--parallel",
-        help="number of parallel threads (not used for sampling)",
+        help="number of parallel processes (not used for sampling)",
         type=int,
         default=4,
     )
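The ``cli.py`` change above is a help-text fix: the ``--parallel`` option spawns worker processes rather than threads, which sidesteps the GIL for CPU-bound URL filtering. A rough library-side equivalent, assuming courlan's public ``check_url()`` function (not touched by this patch) and the standard-library process pool:

.. code-block:: python

    from concurrent.futures import ProcessPoolExecutor

    from courlan import check_url

    urls = [
        "https://example.org/page1",
        "http://example.org/page2?utm_source=newsletter",
    ]

    if __name__ == "__main__":
        # distribute URL checking over 4 processes, mirroring the CLI default
        with ProcessPoolExecutor(max_workers=4) as executor:
            for result in executor.map(check_url, urls):
                # check_url() returns a (url, domain) tuple, or None for rejects
                if result is not None:
                    print(result)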