From a144749e87da65af5f5d1296d5eba7e186294b9c Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Mon, 24 Apr 2023 18:04:33 +0200 Subject: [PATCH] prepare v0.9.1 (#23) * prepare v0.9.1 * Readme wording --- HISTORY.md | 8 ++++++++ README.rst | 8 ++++++++ courlan/__init__.py | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 9ab6b36..5bf57e1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,14 @@ ## History / Changelog +### 0.9.1 + +- network tests: larger throughput +- UrlStore: optional compression of rules (#21), added `reset()` (#22) and `get_all_counts()` methods +- UrlStore fixes: `signal` in #18, `total_url_number` +- updated Readme + + ### 0.9.0 - hardening of filters and URL parses (#14) diff --git a/README.rst b/README.rst index e55f986..d29bce8 100644 --- a/README.rst +++ b/README.rst @@ -255,6 +255,7 @@ The ``UrlStore`` class allow for storing and retrieving domain-classified URLs, - ``dump_urls()``: Return a list of all known URLs. - ``print_urls()``: Print all URLs in store (URL + TAB + visited or not). - ``print_unvisited_urls()``: Print all unvisited URLs in store. + - ``get_all_counts()``: Return all download counts for the hosts in store. - ``get_known_domains()``: Return all known domains as a list. - ``total_url_number()``: Find number of all URLs in store. - ``is_known(url)``: Check if the given URL has already been stored. @@ -263,6 +264,7 @@ The ``UrlStore`` class allow for storing and retrieving domain-classified URLs, - ``filter_unvisited_urls(urls)``: Take a list of URLs and return the currently unvisited ones. - ``find_known_urls(domain)``: Get all already known URLs for the given domain (ex. "https://example.org"). - ``find_unvisited_urls(domain)``: Get all unvisited URLs for the given domain. + - ``reset()``: Re-initialize the URL store. - Crawling and downloads - ``get_url(domain)``: Retrieve a single URL and consider it to be visited (with corresponding timestamp). - ``get_rules(domain)``: Return the stored crawling rules for the given website. @@ -273,6 +275,12 @@ The ``UrlStore`` class allow for storing and retrieving domain-classified URLs, - ``unvisited_websites_number()``: Return the number of websites for which there are still URLs to visit. - ``is_exhausted_domain(domain)``: Tell if all known URLs for the website have been visited. +Optional settings: +- ``compressed=True``: activate compression of URLs and rules +- ``language=XX``: focus on a particular target language (two-letter code) +- ``strict=True``: stricter URL filtering +- ``verbose=True``: dump URLs if interrupted (requires use of ``signal``) + Command-line ------------ diff --git a/courlan/__init__.py b/courlan/__init__.py index c0836e5..3199f47 100644 --- a/courlan/__init__.py +++ b/courlan/__init__.py @@ -8,7 +8,7 @@ __author__ = "Adrien Barbaresi" __license__ = "GNU GPL v3+" __copyright__ = "Copyright 2020-2023, Adrien Barbaresi" -__version__ = "0.9.0" +__version__ = "0.9.1" # imports