diff --git a/HISTORY.md b/HISTORY.md index ba39103..92451b7 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,14 @@ ## History / Changelog +### 0.4.0 + +- URL manipulation tools added: extract parts, fix relative URLs +- filters added: language, navigation and crawls +- more robust link handling and extraction +- removed support for Python 3.4 + + ### 0.3.1 - improve filter precision diff --git a/README.rst b/README.rst index 98c7b45..c11ca53 100644 --- a/README.rst +++ b/README.rst @@ -149,15 +149,22 @@ Determine if a link leads to another host: >>> is_external('https://google.com/', 'https://www.google.co.uk/', ignore_suffix=False) True + Other useful functions: -- ``lang_filter(url, language)``: heuristics concerning internationalization in URLs - ``fix_relative_urls()``: prepend necessary information to relative links - ``get_base_url()``: strip the URL of some of its parts - ``get_host_and_path()``: decompose URLs in two parts: protocol + host/domain and path - ``get_hostinfo()``: extract domain and host info (protocol + host/domain) +Other filters: + +- ``is_not_crawlable(url)``: check for deep web or pages generally not usable in a crawling context +- ``is_navigation_page(url)``: check for navigation and overview pages +- ``lang_filter(url, language)``: heuristics concerning internationalization in URLs + + Command-line ~~~~~~~~~~~~ diff --git a/courlan/__init__.py b/courlan/__init__.py index 8130956..9359021 100644 --- a/courlan/__init__.py +++ b/courlan/__init__.py @@ -14,5 +14,5 @@ # imports from .clean import clean_url, normalize_url, scrub_url from .core import check_url, extract_links, sample_urls -from .filters import lang_filter, validate_url +from .filters import is_navigation_page, is_not_crawlable, lang_filter, validate_url from .urlutils import extract_domain, fix_relative_urls, get_base_url, get_host_and_path, get_hostinfo, is_external diff --git a/tests/unit_tests.py b/tests/unit_tests.py index fa3f913..407a16c 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -19,9 +19,9 @@ except ImportError: TLD_EXTRACTION = None -from courlan import clean_url, normalize_url, scrub_url, check_url, is_external, sample_urls, validate_url, extract_links, extract_domain, fix_relative_urls, get_base_url, get_host_and_path, get_hostinfo +from courlan import clean_url, normalize_url, scrub_url, check_url, is_external, sample_urls, validate_url, extract_links, extract_domain, fix_relative_urls, get_base_url, get_host_and_path, get_hostinfo, is_navigation_page, is_not_crawlable, lang_filter from courlan.cli import parse_args -from courlan.filters import extension_filter, is_navigation_page, is_not_crawlable, lang_filter, spam_filter, type_filter +from courlan.filters import extension_filter, spam_filter, type_filter logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)