prepare new version

adbar · May 25, 2021 · 53a8bac
1 parent 4721260
commit 53a8bac
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 4 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,6 +1,14 @@
 ## History / Changelog
 
 
+### 0.4.0
+
+- URL manipulation tools added: extract parts, fix relative URLs
+- filters added: language, navigation pages and crawlability
+- more robust link handling and extraction
+- removed support for Python 3.4
+
+
 ### 0.3.1
 
 - improve filter precision

diff --git a/README.rst b/README.rst
@@ -149,15 +149,22 @@ Determine if a link leads to another host:
     >>> is_external('https://google.com/', 'https://www.google.co.uk/', ignore_suffix=False)
     True
 
+
 Other useful functions:
 
-- ``lang_filter(url, language)``: heuristics concerning internationalization in URLs
 - ``fix_relative_urls()``: prepend necessary information to relative links
 - ``get_base_url()``: strip the URL of some of its parts
 - ``get_host_and_path()``: decompose URLs into two parts: protocol + host/domain and path
 - ``get_hostinfo()``: extract domain and host info (protocol + host/domain)
 
 
+Other filters:
+
+- ``is_not_crawlable(url)``: check for deep-web pages or other pages generally not usable in a crawling context
+- ``is_navigation_page(url)``: check for navigation and overview pages
+- ``lang_filter(url, language)``: heuristics concerning internationalization in URLs
+
+
 Command-line
 ~~~~~~~~~~~~
 

diff --git a/courlan/__init__.py b/courlan/__init__.py
@@ -14,5 +14,5 @@
 # imports
 from .clean import clean_url, normalize_url, scrub_url
 from .core import check_url, extract_links, sample_urls
-from .filters import lang_filter, validate_url
+from .filters import is_navigation_page, is_not_crawlable, lang_filter, validate_url
 from .urlutils import extract_domain, fix_relative_urls, get_base_url, get_host_and_path, get_hostinfo, is_external
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -19,9 +19,9 @@
 except ImportError:
     TLD_EXTRACTION = None
 
-from courlan import clean_url, normalize_url, scrub_url, check_url, is_external, sample_urls, validate_url, extract_links, extract_domain, fix_relative_urls, get_base_url, get_host_and_path, get_hostinfo
+from courlan import clean_url, normalize_url, scrub_url, check_url, is_external, sample_urls, validate_url, extract_links, extract_domain, fix_relative_urls, get_base_url, get_host_and_path, get_hostinfo, is_navigation_page, is_not_crawlable, lang_filter
 from courlan.cli import parse_args
-from courlan.filters import extension_filter, is_navigation_page, is_not_crawlable, lang_filter, spam_filter, type_filter
+from courlan.filters import extension_filter, spam_filter, type_filter
 
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)