diff --git a/HISTORY.md b/HISTORY.md index d0f9a26..9ab6b36 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,16 @@ ## History / Changelog +### 0.9.0 + +- hardening of filters and URL parsing (#14) +- normalize Punycode to Unicode +- methods added to `UrlStore`: `get_crawl_delay()`, `print_unvisited_urls()` +- `UrlStore` now triggers exit code 1 when interrupted +- argument added to `extract_links()`: `no_filter` +- code refactoring: simplifications + + ### 0.8.3 - fixed bug in domain name extraction diff --git a/README.rst b/README.rst index 67a52b6..e55f986 100644 --- a/README.rst +++ b/README.rst @@ -254,6 +254,7 @@ The ``UrlStore`` class allow for storing and retrieving domain-classified URLs, - ``add_urls(urls=[], appendleft=None, visited=False)``: Add a list of URLs to the (possibly) existing one. Optional: append certain URLs to the left, specify if the URLs have already been visited. - ``dump_urls()``: Return a list of all known URLs. - ``print_urls()``: Print all URLs in store (URL + TAB + visited or not). + - ``print_unvisited_urls()``: Print all unvisited URLs in store. - ``get_known_domains()``: Return all known domains as a list. - ``total_url_number()``: Find number of all URLs in store. - ``is_known(url)``: Check if the given URL has already been stored. @@ -265,6 +266,7 @@ The ``UrlStore`` class allow for storing and retrieving domain-classified URLs, - Crawling and downloads - ``get_url(domain)``: Retrieve a single URL and consider it to be visited (with corresponding timestamp). - ``get_rules(domain)``: Return the stored crawling rules for the given website. + - ``get_crawl_delay()``: Return the delay as extracted from robots.txt, or a given default. - ``get_download_urls(timelimit=10)``: Get a list of immediately downloadable URLs according to the given time limit per domain. - ``establish_download_schedule(max_urls=100, time_limit=10)``: Get up to the specified number of URLs along with a suitable backoff schedule (in seconds). 
- ``download_threshold_reached(threshold)``: Find out if the download limit (in seconds) has been reached for one of the websites in store. diff --git a/courlan/__init__.py b/courlan/__init__.py index ef6c125..c0836e5 100644 --- a/courlan/__init__.py +++ b/courlan/__init__.py @@ -7,8 +7,8 @@ __title__ = "courlan" __author__ = "Adrien Barbaresi" __license__ = "GNU GPL v3+" -__copyright__ = "Copyright 2020-2022, Adrien Barbaresi" -__version__ = "0.8.3" +__copyright__ = "Copyright 2020-2023, Adrien Barbaresi" +__version__ = "0.9.0" # imports diff --git a/courlan/clean.py b/courlan/clean.py index 2059c7b..abfe5e7 100644 --- a/courlan/clean.py +++ b/courlan/clean.py @@ -9,7 +9,7 @@ import re from typing import Optional, Union -from urllib.parse import parse_qs, urlencode, urlparse, ParseResult +from urllib.parse import parse_qs, urlencode, ParseResult from .filters import validate_url from .settings import ALLOWED_PARAMS, CONTROL_PARAMS, TARGET_LANG_DE, TARGET_LANG_EN diff --git a/courlan/langinfo.py b/courlan/langinfo.py index d08f796..93ae42b 100644 --- a/courlan/langinfo.py +++ b/courlan/langinfo.py @@ -2,8 +2,10 @@ Constants containing info about languages and countries. 
""" +from typing import Set -LANGUAGE_CODES = { + +LANGUAGE_CODES: Set[str] = { "aa", "ab", "ae", @@ -191,7 +193,7 @@ } -COUNTRY_CODES = { +COUNTRY_CODES: Set[str] = { "aw", "af", "ao", diff --git a/setup.py b/setup.py index 27f8dcb..b3efa61 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ def get_version(package): "Return package version as listed in `__version__` in `init.py`" - initfile = Path(package, '__init__.py').read_text() # Python >= 3.5 + initfile = Path(package, "__init__.py").read_text(encoding="utf-8") return re.search("__version__ = ['\"]([^'\"]+)['\"]", initfile)[1] @@ -38,6 +38,7 @@ def get_long_description(): "courlan/clean.py", "courlan/core.py", "courlan/filters.py", + "courlan/langinfo.py", "courlan/settings.py", "courlan/urlstore.py", "courlan/urlutils.py", @@ -105,7 +106,8 @@ def get_long_description(): python_requires=">=3.6", install_requires=[ "langcodes >= 3.3.0", - "tld >= 0.12.6", + "tld == 0.12.6; python_version < '3.7'", + "tld >= 0.13; python_version >= '3.7'", "urllib3 >= 1.26, < 2", ], # extras_require=extras,