From e4740323275aab3b48cd3277d7b474e29da627b5 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Wed, 31 Jul 2019 17:52:54 +0300 Subject: [PATCH 01/79] Add badge with download stats --- README.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.rst b/README.rst index d5b2c72..d632323 100644 --- a/README.rst +++ b/README.rst @@ -14,6 +14,10 @@ :target: https://badge.fury.io/py/ultimate-sitemap-parser :alt: PyPI package +.. image:: https://pepy.tech/badge/ultimate-sitemap-parser + :target: https://pepy.tech/project/ultimate-sitemap-parser + :alt: Download stats + Website sitemap parser for Python 3.5+. From ecb9fc1306e14f097bd4bb62e0e35a1682618fb0 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Wed, 31 Jul 2019 17:55:04 +0300 Subject: [PATCH 02/79] Normalize project name --- .idea/mediacloud-ultimate_sitemap_parser.iml | 2 +- .idea/misc.xml | 2 +- .idea/modules.xml | 2 +- .idea/runConfigurations/pytest_in_test_helpers_py.xml | 4 ++-- .idea/runConfigurations/pytest_in_test_tree_py.xml | 4 ++-- README.rst | 10 +++++----- setup.py | 4 ++-- tests/test_tree.py | 2 +- tests/web_client/test_requests_client.py | 4 ++-- usp/web_client/requests_client.py | 2 +- 10 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.idea/mediacloud-ultimate_sitemap_parser.iml b/.idea/mediacloud-ultimate_sitemap_parser.iml index f438c56..7c18129 100644 --- a/.idea/mediacloud-ultimate_sitemap_parser.iml +++ b/.idea/mediacloud-ultimate_sitemap_parser.iml @@ -2,7 +2,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index bc3579d..d0ccad8 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml index 34b846b..1640676 100644 --- a/.idea/modules.xml +++ b/.idea/modules.xml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/.idea/runConfigurations/pytest_in_test_helpers_py.xml b/.idea/runConfigurations/pytest_in_test_helpers_py.xml index 7cf5704..2ec4b42 100644 --- a/.idea/runConfigurations/pytest_in_test_helpers_py.xml +++ b/.idea/runConfigurations/pytest_in_test_helpers_py.xml @@ -1,9 +1,9 @@ - + \ No newline at end of file diff --git a/tests/test_tree.py b/tests/test_tree.py index 72d148b..0b42fa7 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -65,6 +65,7 @@ def fallback_to_404_not_found_matcher(request): text="
<h1>404 Not Found!</h1>
", ) + # noinspection DuplicatedCode def test_sitemap_tree_for_homepage(self): """Test sitemap_tree_for_homepage().""" @@ -594,6 +595,7 @@ def test_sitemap_tree_for_homepage_plain_text(self): assert SitemapPage(url='{}/news/bar.html'.format(self.TEST_BASE_URL)) in pages assert SitemapPage(url='{}/news/baz.html'.format(self.TEST_BASE_URL)) in pages + # noinspection DuplicatedCode def test_sitemap_tree_for_homepage_rss_atom(self): """Test sitemap_tree_for_homepage() with RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index 3af1552..9f92c23 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -685,7 +685,7 @@ class PagesRSSSitemapParser(AbstractXMLSitemapParser): class Page(object): """ - Simple data class for holding various properties for a single entry while parsing. + Data class for holding various properties for a single RSS while parsing. """ __slots__ = [ @@ -819,7 +819,7 @@ class PagesAtomSitemapParser(AbstractXMLSitemapParser): # FIXME merge with RSS parser class as there are too many similarities class Page(object): - """Simple data class for holding various properties for a single entry while parsing.""" + """Data class for holding various properties for a single Atom while parsing.""" __slots__ = [ 'link', From d3bdaae56be87c97ce2f3f845087f495f6439b44 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Tue, 17 Mar 2020 18:37:54 +0800 Subject: [PATCH 09/79] Add sitemap_news.xml to unpublished sitemap paths --- usp/tree.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/usp/tree.py b/usp/tree.py index 490b23e..930053c 100644 --- a/usp/tree.py +++ b/usp/tree.py @@ -22,6 +22,10 @@ 'sitemap', 'admin/config/search/xmlsitemap', 'sitemap/sitemap-index.xml', + 'sitemap_news.xml', + 'sitemap-news.xml', + 'sitemap_news.xml.gz', + 'sitemap-news.xml.gz', } """Paths which are not exposed in robots.txt but might still contain a sitemap.""" From 62dd39c7bdc2e5b955827e188e2803f13d118b7c Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Tue, 28 Apr 2020 03:44:21 +0800 Subject: [PATCH 10/79] Update PyCharm project --- .idea/mediacloud-ultimate-sitemap-parser.iml | 2 +- .idea/misc.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.idea/mediacloud-ultimate-sitemap-parser.iml b/.idea/mediacloud-ultimate-sitemap-parser.iml index 2962712..ccc9715 100644 --- a/.idea/mediacloud-ultimate-sitemap-parser.iml +++ b/.idea/mediacloud-ultimate-sitemap-parser.iml @@ -4,7 +4,7 @@ - +
diff --git a/.idea/misc.xml b/.idea/misc.xml index d0ccad8..6b7670f 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file From 3867b6e446c14050beb589989ed379670cf68c14 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Tue, 28 Apr 2020 03:44:35 +0800 Subject: [PATCH 11/79] Get rid of some warnings --- tests/test_tree.py | 17 ++++++++--------- usp/web_client/abstract_client.py | 14 +++++++------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/tests/test_tree.py b/tests/test_tree.py index 0b42fa7..df828ad 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -3,11 +3,10 @@ import textwrap from decimal import Decimal from email.utils import format_datetime -from http import HTTPStatus from unittest import TestCase -import dateutil import requests_mock +from dateutil.tz import tzoffset from tests.helpers import gzip from usp.log import create_logger @@ -43,7 +42,7 @@ class TestSitemapTree(TestCase): # Publication / "last modified" date TEST_DATE_DATETIME = datetime.datetime( year=2009, month=12, day=17, hour=12, minute=4, second=56, - tzinfo=dateutil.tz.tzoffset(None, 7200), + tzinfo=tzoffset(None, 7200), ) TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat() """Test string date formatted as ISO 8601 (for XML and Atom 0.3 / 1.0 sitemaps).""" @@ -59,8 +58,8 @@ def fallback_to_404_not_found_matcher(request): """Reply with "404 Not Found" to unmatched URLs instead of throwing NoMockAddress.""" return requests_mock.create_response( request, - status_code=HTTPStatus.NOT_FOUND.value, - reason=HTTPStatus.NOT_FOUND.phrase, + status_code=404, + reason='Not Found', headers={'Content-Type': 'text/html'}, text="
<h1>404 Not Found!</h1>
", ) @@ -276,8 +275,8 @@ def test_sitemap_tree_for_homepage(self): # Nonexistent sitemap m.get( self.TEST_BASE_URL + '/sitemap_news_missing.xml', - status_code=HTTPStatus.NOT_FOUND.value, - reason=HTTPStatus.NOT_FOUND.phrase, + status_code=404, + reason='Not Found', headers={'Content-Type': 'text/html'}, text="
<h1>404 Not Found!</h1>
", ) @@ -1177,8 +1176,8 @@ def test_sitemap_tree_for_homepage_no_robots_txt(self): # Nonexistent robots.txt m.get( self.TEST_BASE_URL + '/robots.txt', - status_code=HTTPStatus.NOT_FOUND.value, - reason=HTTPStatus.NOT_FOUND.phrase, + status_code=404, + reason='Not Found', headers={'Content-Type': 'text/html'}, text="
<h1>404 Not Found!</h1>
", ) diff --git a/usp/web_client/abstract_client.py b/usp/web_client/abstract_client.py index 505d7ea..bcb06ef 100644 --- a/usp/web_client/abstract_client.py +++ b/usp/web_client/abstract_client.py @@ -7,25 +7,25 @@ RETRYABLE_HTTP_STATUS_CODES = { # Some servers return "400 Bad Request" initially but upon retry start working again, no idea why - HTTPStatus.BAD_REQUEST.value, + int(HTTPStatus.BAD_REQUEST), # If we timed out requesting stuff, we can just try again - HTTPStatus.REQUEST_TIMEOUT.value, + int(HTTPStatus.REQUEST_TIMEOUT), # If we got rate limited, it makes sense to wait a bit - HTTPStatus.TOO_MANY_REQUESTS.value, + int(HTTPStatus.TOO_MANY_REQUESTS), # Server might be just fine on a subsequent attempt - HTTPStatus.INTERNAL_SERVER_ERROR.value, + int(HTTPStatus.INTERNAL_SERVER_ERROR), # Upstream might reappear on a retry - HTTPStatus.BAD_GATEWAY.value, + int(HTTPStatus.BAD_GATEWAY), # Service might become available again on a retry - HTTPStatus.SERVICE_UNAVAILABLE.value, + int(HTTPStatus.SERVICE_UNAVAILABLE), # Upstream might reappear on a retry - HTTPStatus.GATEWAY_TIMEOUT.value, + int(HTTPStatus.GATEWAY_TIMEOUT), # (unofficial) 509 Bandwidth Limit Exceeded (Apache Web Server/cPanel) 509, From 859a4ae4a72075bbd6d944ef39f9e1e26727d6da Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Tue, 8 Sep 2020 18:42:55 +0300 Subject: [PATCH 12/79] Update repo URLs --- README.rst | 8 ++++---- setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index b21f738..2e32d45 100644 --- a/README.rst +++ b/README.rst @@ -1,13 +1,13 @@ -.. image:: https://travis-ci.org/berkmancenter/mediacloud-ultimate-sitemap-parser.svg?branch=develop - :target: https://travis-ci.org/berkmancenter/mediacloud-ultimate-sitemap-parser +.. image:: https://travis-ci.org/mediacloud/ultimate-sitemap-parser.svg?branch=develop + :target: https://travis-ci.org/mediacloud/ultimate-sitemap-parser :alt: Build Status .. image:: https://readthedocs.org/projects/ultimate-sitemap-parser/badge/?version=latest :target: https://ultimate-sitemap-parser.readthedocs.io/en/latest/?badge=latest :alt: Documentation Status -.. image:: https://coveralls.io/repos/github/berkmancenter/mediacloud-ultimate-sitemap-parser/badge.svg?branch=develop - :target: https://coveralls.io/github/berkmancenter/mediacloud-ultimate-sitemap-parser?branch=develop +.. image:: https://coveralls.io/repos/github/mediacloud/ultimate-sitemap-parser/badge.svg?branch=develop + :target: https://coveralls.io/github/mediacloud/ultimate-sitemap-parser?branch=develop :alt: Coverage Status .. 
image:: https://badge.fury.io/py/ultimate-sitemap-parser.svg diff --git a/setup.py b/setup.py index 68db8f5..6ad773f 100755 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ def __readme(): long_description=__readme(), author='Linas Valiukas, Hal Roberts, Media Cloud project', author_email='linas@media.mit.edu, hroberts@cyber.law.harvard.edu', - url='https://github.com/berkmancenter/mediacloud-ultimate-sitemap-parser', + url='https://github.com/mediacloud/ultimate-sitemap-parser', license='GPLv3+', keywords="sitemap sitemap-xml parser", packages=find_packages(exclude=['tests']), From 68d1ccdd573be16cdfadc7f071c088642182bff1 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Tue, 8 Sep 2020 23:07:31 +0300 Subject: [PATCH 13/79] Update URL to backend repo --- tests/test_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 3b0e164..880c01c 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -73,7 +73,7 @@ def test_is_http_url(): assert not is_http_url('ftp://ftp.freebsd.org/pub/FreeBSD/') assert is_http_url('http://cyber.law.harvard.edu/about') - assert is_http_url('https://github.com/berkmancenter/mediacloud') + assert is_http_url('https://github.com/mediacloud/backend') # URLs with port, HTTP auth, localhost assert is_http_url('https://username:password@domain.com:12345/path?query=string#fragment') From de136f543cf90eeff1956eae33506b56985dfe0e Mon Sep 17 00:00:00 2001 From: Dustin Oprea Date: Fri, 20 Nov 2020 01:06:35 -0500 Subject: [PATCH 14/79] log.py: Eliminate log configuration Logging should be configured at the application level, and, by doing it here, it makes this hard to control and also interferes with the configuration that *is* done at the application-level whereever this package is used. Fixes #23 --- usp/log.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/usp/log.py b/usp/log.py index a642a4c..2f2eae8 100644 --- a/usp/log.py +++ b/usp/log.py @@ -33,20 +33,6 @@ def __init__(self, name: str): """ self.__l = logging.getLogger(name) - if not self.__l.handlers: - formatter = logging.Formatter( - fmt='%(asctime)s %(levelname)s %(name)s [%(process)d/%(threadName)s]: %(message)s' - ) - - handler = logging.StreamHandler() - handler.setFormatter(formatter) - self.__l.addHandler(handler) - - self.__l.setLevel(self.__LEVELS[self.__DEFAULT_LEVEL]) - - # Don't propagate handler to root logger - # (http://stackoverflow.com/a/21127526/200603) - self.__l.propagate = False def error(self, message: str) -> None: """ From cf317c018f2286df5500eedac4668c9a71642aca Mon Sep 17 00:00:00 2001 From: Arthur Melin <3268661+ArthurMelin@users.noreply.github.com> Date: Tue, 10 May 2022 17:08:50 +0200 Subject: [PATCH 15/79] Fix incorrect lowercasing of robots.txt Sitemap URLs --- usp/fetch_parse.py | 1 - 1 file changed, 1 deletion(-) diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index 9f92c23..7f6d577 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -162,7 +162,6 @@ def sitemap(self) -> AbstractSitemap: for robots_txt_line in self._content.splitlines(): robots_txt_line = robots_txt_line.strip() # robots.txt is supposed to be case sensitive but who cares in these Node.js times? 
- robots_txt_line = robots_txt_line.lower() sitemap_match = re.search(r'^site-?map:\s*(.+?)$', robots_txt_line, flags=re.IGNORECASE) if sitemap_match: sitemap_url = sitemap_match.group(1) From dd48c330196e3f7a564fd11f261cbdf4f96aa4aa Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 9 Nov 2022 10:04:59 +0000 Subject: [PATCH 16/79] Add anaconda installation details to README --- README.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.rst b/README.rst index 2e32d45..cd85807 100644 --- a/README.rst +++ b/README.rst @@ -14,6 +14,10 @@ :target: https://badge.fury.io/py/ultimate-sitemap-parser :alt: PyPI package +.. image:: https://img.shields.io/conda/v/conda-forge/ultimate-sitemap-parser?color=brightgreen + :target: https://anaconda.org/conda-forge/ultimate-sitemap-parser + :alt: Conda + .. image:: https://pepy.tech/badge/ultimate-sitemap-parser :target: https://pepy.tech/project/ultimate-sitemap-parser :alt: Download stats @@ -51,6 +55,12 @@ Installation pip install ultimate-sitemap-parser +or using Anaconda: + +.. code:: sh + + conda install -c conda-forge ultimate-sitemap-parser + Usage ===== From f70de00ee7059bd22ee0c55b17563363004a874b Mon Sep 17 00:00:00 2001 From: "japherwocky@japhbookpro" Date: Thu, 22 Dec 2022 19:00:33 -0500 Subject: [PATCH 17/79] add optional argument to requests web client, to ignore SSL checking --- usp/web_client/requests_client.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/usp/web_client/requests_client.py b/usp/web_client/requests_client.py index 962f9fa..30d9078 100644 --- a/usp/web_client/requests_client.py +++ b/usp/web_client/requests_client.py @@ -75,10 +75,11 @@ class RequestsWebClient(AbstractWebClient): '__proxies', ] - def __init__(self): + def __init__(self, verify=True): self.__max_response_data_length = None self.__timeout = self.__HTTP_REQUEST_TIMEOUT self.__proxies = {} + self.__verify = verify def set_timeout(self, timeout: int) -> None: """Set HTTP request timeout.""" @@ -109,7 +110,8 @@ def get(self, url: str) -> AbstractWebClientResponse: timeout=self.__timeout, stream=True, headers={'User-Agent': self.__USER_AGENT}, - proxies=self.__proxies + proxies=self.__proxies, + verify=self.__verify, ) except requests.exceptions.Timeout as ex: # Retryable timeouts From e5b00ec31bbaaccfb4ad67b99c85631bba18accb Mon Sep 17 00:00:00 2001 From: Gary Benson Date: Wed, 17 May 2023 11:38:51 +0100 Subject: [PATCH 18/79] Fix test with newer urllib3 --- tests/web_client/test_requests_client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/web_client/test_requests_client.py b/tests/web_client/test_requests_client.py index 2a6d5eb..45be93d 100644 --- a/tests/web_client/test_requests_client.py +++ b/tests/web_client/test_requests_client.py @@ -1,3 +1,4 @@ +import re import socket from http import HTTPStatus from unittest import TestCase @@ -94,7 +95,9 @@ def test_get_nonexistent_domain(self): assert response assert isinstance(response, WebClientErrorResponse) assert response.retryable() is False - assert 'Failed to establish a new connection' in response.message() + assert re.search( + r'Failed to (establish a new connection|resolve)', + response.message()) is not None def test_get_timeout(self): sock = socket.socket() From 26966a2931e8103bb962894baeae595389d16980 Mon Sep 17 00:00:00 2001 From: Gary Benson Date: Wed, 17 May 2023 11:45:32 +0100 Subject: [PATCH 19/79] Don't include InvalidSitemap objects in trees --- tests/test_tree.py | 9 +-------- usp/tree.py | 3 ++- 2 files 
changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/test_tree.py b/tests/test_tree.py index df828ad..ba04e11 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -1184,14 +1184,7 @@ def test_sitemap_tree_for_homepage_no_robots_txt(self): expected_sitemap_tree = IndexWebsiteSitemap( url='{}/'.format(self.TEST_BASE_URL), - sub_sitemaps=[ - InvalidSitemap( - url='{}/robots.txt'.format(self.TEST_BASE_URL), - reason=( - 'Unable to fetch sitemap from {base_url}/robots.txt: 404 Not Found' - ).format(base_url=self.TEST_BASE_URL), - ) - ] + sub_sitemaps=[], ) actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) diff --git a/usp/tree.py b/usp/tree.py index 930053c..26431bb 100644 --- a/usp/tree.py +++ b/usp/tree.py @@ -55,7 +55,8 @@ def sitemap_tree_for_homepage(homepage_url: str, web_client: Optional[AbstractWe robots_txt_fetcher = SitemapFetcher(url=robots_txt_url, web_client=web_client, recursion_level=0) robots_txt_sitemap = robots_txt_fetcher.sitemap() - sitemaps.append(robots_txt_sitemap) + if not isinstance(robots_txt_sitemap, InvalidSitemap): + sitemaps.append(robots_txt_sitemap) sitemap_urls_found_in_robots_txt = set() if isinstance(robots_txt_sitemap, IndexRobotsTxtSitemap): From de44bd5a73c7f6de96ee99c7696a3fc11b03b0c5 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 16 Aug 2024 15:57:29 +0100 Subject: [PATCH 20/79] bump version --- usp/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usp/__about__.py b/usp/__about__.py index 25f376d..3aad052 100644 --- a/usp/__about__.py +++ b/usp/__about__.py @@ -1,3 +1,3 @@ """Package version.""" -__version__ = "0.5" +__version__ = "0.6" From a2c85cb327220bd220eedf99bfbdc9c901802891 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 16 Aug 2024 18:42:51 +0100 Subject: [PATCH 21/79] Migrate to Poetry --- poetry.lock | 302 +++++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 34 ++++++ setup.cfg | 11 -- setup.py | 72 ------------ 4 files changed, 336 insertions(+), 83 deletions(-) create mode 100644 poetry.lock create mode 100644 pyproject.toml delete mode 100644 setup.cfg delete mode 100755 setup.py diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..a6735f5 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,302 @@ +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. + +[[package]] +name = "certifi" +version = "2024.7.4" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.3.2" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, + {file = 
"charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, + {file = 
"charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, + {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "idna" +version = "3.7" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.5" +files = [ + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, +] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "packaging" +version = "24.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pytest" +version = 
"8.3.2" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"}, + {file = "pytest-8.3.2.tar.gz", hash = "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "requests" +version = "2.32.3" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.8" +files = [ + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "requests-mock" +version = "1.12.1" +description = "Mock out responses from the requests package" +optional = false +python-versions = ">=3.5" +files = [ + {file = "requests-mock-1.12.1.tar.gz", hash = "sha256:e9e12e333b525156e82a3c852f22016b9158220d2f47454de9cae8a77d371401"}, + {file = "requests_mock-1.12.1-py2.py3-none-any.whl", hash = "sha256:b1e37054004cdd5e56c84454cc7df12b25f90f382159087f4b6915aaeef39563"}, +] + +[package.dependencies] +requests = ">=2.22,<3" + +[package.extras] +fixture = ["fixtures"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[[package]] +name = "urllib3" +version = "2.2.2" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, + {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[metadata] +lock-version = "2.0" +python-versions = "^3.8" +content-hash = "3e6c60d21088d98a4f57502f74a39f73ec178566af217434962810ce79f82364" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..702e1b8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,34 @@ +[tool.poetry] +name = "ultimate-sitemap-parser" +version = "0.6.0" +description = "Ult" +authors = [ + "Linas Valiukas ", + "Hal Roberts " +] +license = "GPL-3.0-or-later" +readme = "README.rst" +classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'Intended Audience :: Information Technology', + 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', + 'Programming Language :: Python', + 'Operating System :: OS Independent', + 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', + 'Topic :: Text Processing :: Indexing', + 'Topic :: Text Processing :: Markup :: XML', +] + +[tool.poetry.dependencies] +python = "^3.8" +python-dateutil = ">=2.1,<3.0.0" +requests = ">=2.2.1" + +[tool.poetry.group.dev.dependencies] +requests-mock = ">=1.6.0,<2.0" +pytest = ">=2.8" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index bf1a1d6..0000000 --- a/setup.cfg +++ /dev/null @@ -1,11 +0,0 @@ -[wheel] -universal = 1 - -[check-manifest] -ignore = - .travis.yml - .gitignore - .idea - -[aliases] -test=pytest diff --git a/setup.py b/setup.py deleted file mode 100755 index 6ad773f..0000000 --- a/setup.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python - -from setuptools import setup, find_packages - -from usp.__about__ import __version__ - - -def __readme(): - with open('README.rst', mode='r', encoding='utf-8') as f: - return f.read() - - -tests_require = [ - - # Mock HTTP server - 'requests_mock>=1.6.0,<2.0', - - # Running tests - 'pytest>=2.8', - -] - -setup( - name='ultimate-sitemap-parser', - version=__version__, - description='Ultimate Sitemap Parser', - long_description=__readme(), - author='Linas Valiukas, Hal Roberts, Media Cloud project', - author_email='linas@media.mit.edu, hroberts@cyber.law.harvard.edu', - url='https://github.com/mediacloud/ultimate-sitemap-parser', - license='GPLv3+', - keywords="sitemap sitemap-xml parser", - packages=find_packages(exclude=['tests']), - zip_safe=True, - python_requires='>=3.5', - install_requires=[ - - # Parsing arbitrary dates (sitemap date format is standardized but some implementations take liberties) - 'python-dateutil>=2.1,<3.0.0', - - # Making HTTP requests - 'requests>=2.2.1', - - ], - setup_requires=[ - - # Running tests as part of setup.py - 'pytest-runner>=4.2,<5.0', - - ], - tests_require=tests_require, - extras_require={ - 'test': tests_require, - }, - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Intended Audience :: Information Technology', - 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', - 'Programming Language :: Python', - 'Operating System :: OS Independent', - 
'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: Implementation :: PyPy', - 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', - 'Topic :: Text Processing :: Indexing', - 'Topic :: Text Processing :: Markup :: XML', - ] -) From b90ab071f4243b0af11d96a6b480b5a51780e336 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 16 Aug 2024 18:43:10 +0100 Subject: [PATCH 22/79] Add full license text and notice --- LICENSE | 674 ++++++++++++++++++++++++++++++++++++++++++ LICENSE.txt => NOTICE | 2 +- 2 files changed, 675 insertions(+), 1 deletion(-) create mode 100644 LICENSE rename LICENSE.txt => NOTICE (87%) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e72bfdd --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. 
This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
\ No newline at end of file diff --git a/LICENSE.txt b/NOTICE similarity index 87% rename from LICENSE.txt rename to NOTICE index 3b84f8a..e4fb39c 100644 --- a/LICENSE.txt +++ b/NOTICE @@ -1,4 +1,4 @@ -Copyright (C) 2018 Linas Valiukas, Hal Roberts, 2018 Media Cloud project +Copyright (C) 2018 Linas Valiukas, Hal Roberts, Media Cloud project This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by From 7cbd02390c778954933e1ceec805ec26d1e0f765 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 16 Aug 2024 18:43:58 +0100 Subject: [PATCH 23/79] Remove manifest file --- MANIFEST.in | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 6008d27..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,5 +0,0 @@ -include *.rst -include *.txt -include setup.* -recursive-include usp *.py -include MANIFEST.in From 88c8971a797737550699c0323aa7516cf81948fb Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 16 Aug 2024 18:44:59 +0100 Subject: [PATCH 24/79] Remove .idea dir --- .gitignore | 68 +------------------ .idea/encodings.xml | 4 -- .idea/inspectionProfiles/Project_Default.xml | 19 ------ .idea/mediacloud-ultimate-sitemap-parser.iml | 16 ----- .idea/misc.xml | 7 -- .idea/modules.xml | 8 --- .../pytest_in_test_helpers_py.xml | 18 ----- .../pytest_in_test_tree_py.xml | 18 ----- .idea/vcs.xml | 6 -- 9 files changed, 1 insertion(+), 163 deletions(-) delete mode 100644 .idea/encodings.xml delete mode 100644 .idea/inspectionProfiles/Project_Default.xml delete mode 100644 .idea/mediacloud-ultimate-sitemap-parser.iml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/runConfigurations/pytest_in_test_helpers_py.xml delete mode 100644 .idea/runConfigurations/pytest_in_test_tree_py.xml delete mode 100644 .idea/vcs.xml diff --git a/.gitignore b/.gitignore index bd222aa..5e5c966 100644 --- a/.gitignore +++ b/.gitignore @@ -114,70 +114,4 @@ dmypy.json # Pyre type checker .pyre/ -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Generated files -.idea/**/contentModel.xml - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. 
-# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/httpRequests - -# Android studio 3.1+ serialized cache file -.idea/caches/build_file_checksums.ser - +.idea/ \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml deleted file mode 100644 index 15a15b2..0000000 --- a/.idea/encodings.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index cd0be2b..0000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/mediacloud-ultimate-sitemap-parser.iml b/.idea/mediacloud-ultimate-sitemap-parser.iml deleted file mode 100644 index ccc9715..0000000 --- a/.idea/mediacloud-ultimate-sitemap-parser.iml +++ /dev/null @@ -1,16 +0,0 @@ - - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 6b7670f..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 1640676..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/runConfigurations/pytest_in_test_helpers_py.xml b/.idea/runConfigurations/pytest_in_test_helpers_py.xml deleted file mode 100644 index 2ec4b42..0000000 --- a/.idea/runConfigurations/pytest_in_test_helpers_py.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - - \ No newline at end of file diff --git a/.idea/runConfigurations/pytest_in_test_tree_py.xml b/.idea/runConfigurations/pytest_in_test_tree_py.xml deleted file mode 100644 index de85e07..0000000 --- a/.idea/runConfigurations/pytest_in_test_tree_py.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From 963d342aa73f65adf892746dbe94e5d5621f3c7d Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 16 Aug 2024 18:47:09 +0100 Subject: [PATCH 25/79] Reformat with Ruff --- Makefile | 11 + poetry.lock | 29 +- pyproject.toml | 14 + tests/helpers.py | 6 +- tests/test_helpers.py | 138 +++- tests/test_tree.py | 958 ++++++++++++----------- tests/web_client/test_requests_client.py | 57 +- usp/exceptions.py | 4 + usp/fetch_parse.py | 406 +++++----- usp/helpers.py | 81 +- usp/log.py | 16 +- usp/objects/page.py | 150 ++-- usp/objects/sitemap.py | 68 +- usp/tree.py | 56 +- usp/web_client/abstract_client.py | 33 +- usp/web_client/requests_client.py | 37 +- 16 files changed, 1145 insertions(+), 919 deletions(-) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..65444ba --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +.PHONY: test +test: + poetry run pytest + +.PHONY: lint +lint: + poetry run ruff check --fix + +.PHONY: 
format +format: + poetry run ruff format \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index a6735f5..8bc3be3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -257,6 +257,33 @@ requests = ">=2.22,<3" [package.extras] fixture = ["fixtures"] +[[package]] +name = "ruff" +version = "0.6.1" +description = "An extremely fast Python linter and code formatter, written in Rust." +optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.6.1-py3-none-linux_armv6l.whl", hash = "sha256:b4bb7de6a24169dc023f992718a9417380301b0c2da0fe85919f47264fb8add9"}, + {file = "ruff-0.6.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:45efaae53b360c81043e311cdec8a7696420b3d3e8935202c2846e7a97d4edae"}, + {file = "ruff-0.6.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:bc60c7d71b732c8fa73cf995efc0c836a2fd8b9810e115be8babb24ae87e0850"}, + {file = "ruff-0.6.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c7477c3b9da822e2db0b4e0b59e61b8a23e87886e727b327e7dcaf06213c5cf"}, + {file = "ruff-0.6.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a0af7ab3f86e3dc9f157a928e08e26c4b40707d0612b01cd577cc84b8905cc9"}, + {file = "ruff-0.6.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:392688dbb50fecf1bf7126731c90c11a9df1c3a4cdc3f481b53e851da5634fa5"}, + {file = "ruff-0.6.1-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5278d3e095ccc8c30430bcc9bc550f778790acc211865520f3041910a28d0024"}, + {file = "ruff-0.6.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fe6d5f65d6f276ee7a0fc50a0cecaccb362d30ef98a110f99cac1c7872df2f18"}, + {file = "ruff-0.6.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2e0dd11e2ae553ee5c92a81731d88a9883af8db7408db47fc81887c1f8b672e"}, + {file = "ruff-0.6.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d812615525a34ecfc07fd93f906ef5b93656be01dfae9a819e31caa6cfe758a1"}, + {file = "ruff-0.6.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:faaa4060f4064c3b7aaaa27328080c932fa142786f8142aff095b42b6a2eb631"}, + {file = "ruff-0.6.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:99d7ae0df47c62729d58765c593ea54c2546d5de213f2af2a19442d50a10cec9"}, + {file = "ruff-0.6.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9eb18dfd7b613eec000e3738b3f0e4398bf0153cb80bfa3e351b3c1c2f6d7b15"}, + {file = "ruff-0.6.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:c62bc04c6723a81e25e71715aa59489f15034d69bf641df88cb38bdc32fd1dbb"}, + {file = "ruff-0.6.1-py3-none-win32.whl", hash = "sha256:9fb4c4e8b83f19c9477a8745e56d2eeef07a7ff50b68a6998f7d9e2e3887bdc4"}, + {file = "ruff-0.6.1-py3-none-win_amd64.whl", hash = "sha256:c2ebfc8f51ef4aca05dad4552bbcf6fe8d1f75b2f6af546cc47cc1c1ca916b5b"}, + {file = "ruff-0.6.1-py3-none-win_arm64.whl", hash = "sha256:3bc81074971b0ffad1bd0c52284b22411f02a11a012082a76ac6da153536e014"}, + {file = "ruff-0.6.1.tar.gz", hash = "sha256:af3ffd8c6563acb8848d33cd19a69b9bfe943667f0419ca083f8ebe4224a3436"}, +] + [[package]] name = "six" version = "1.16.0" @@ -299,4 +326,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "3e6c60d21088d98a4f57502f74a39f73ec178566af217434962810ce79f82364" +content-hash = "521cedf7d4b94b7856f3f291ec2b9a594e71aaaf21ab207cbd020f91c250d633" diff --git a/pyproject.toml b/pyproject.toml index 702e1b8..ba8d508 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,21 @@ requests = ">=2.2.1" 
[tool.poetry.group.dev.dependencies] requests-mock = ">=1.6.0,<2.0" pytest = ">=2.8" +ruff = "^0.6.1" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +[tool.ruff] +target-version = "py38" +extend-exclude = ["docs/*"] + +[tool.ruff.lint] +select = [ + "E4", + "E7", + "E9", + "F", + "UP" +] \ No newline at end of file diff --git a/tests/helpers.py b/tests/helpers.py index 075e654..7af1a25 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -10,15 +10,15 @@ def gzip(data: Union[str, bytes]) -> bytes: raise Exception("Data is None.") if isinstance(data, str): - data = data.encode('utf-8') + data = data.encode("utf-8") if not isinstance(data, bytes): - raise Exception("Data is not str or bytes: %s" % str(data)) + raise Exception(f"Data is not str or bytes: {str(data)}") try: gzipped_data = gzip_lib.compress(data, compresslevel=9) except Exception as ex: - raise Exception("Unable to gzip data: %s" % str(ex)) + raise Exception(f"Unable to gzip data: {str(ex)}") if gzipped_data is None: raise Exception("Gzipped data is None.") diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 880c01c..27ce468 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -2,7 +2,11 @@ import pytest -from usp.exceptions import StripURLToHomepageException, SitemapException, GunzipException +from usp.exceptions import ( + StripURLToHomepageException, + SitemapException, + GunzipException, +) from usp.helpers import ( html_unescape_strip, parse_iso8601_date, @@ -24,36 +28,71 @@ def test_parse_iso8601_date(): parse_iso8601_date(None) with pytest.raises(SitemapException): - parse_iso8601_date('') + parse_iso8601_date("") - assert parse_iso8601_date("1997-07-16") == datetime.datetime(year=1997, month=7, day=16) + assert parse_iso8601_date("1997-07-16") == datetime.datetime( + year=1997, month=7, day=16 + ) assert parse_iso8601_date("1997-07-16T19:20+01:00") == datetime.datetime( - year=1997, month=7, day=16, hour=19, minute=20, + year=1997, + month=7, + day=16, + hour=19, + minute=20, tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)), ) assert parse_iso8601_date("1997-07-16T19:20:30+01:00") == datetime.datetime( - year=1997, month=7, day=16, hour=19, minute=20, second=30, + year=1997, + month=7, + day=16, + hour=19, + minute=20, + second=30, tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)), ) assert parse_iso8601_date("1997-07-16T19:20:30.45+01:00") == datetime.datetime( - year=1997, month=7, day=16, hour=19, minute=20, second=30, microsecond=450000, + year=1997, + month=7, + day=16, + hour=19, + minute=20, + second=30, + microsecond=450000, tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)), ) # "Z" timezone instead of "+\d\d:\d\d" assert parse_iso8601_date("2018-01-12T21:57:27Z") == datetime.datetime( - year=2018, month=1, day=12, hour=21, minute=57, second=27, tzinfo=datetime.timezone.utc, + year=2018, + month=1, + day=12, + hour=21, + minute=57, + second=27, + tzinfo=datetime.timezone.utc, ) def test_parse_rfc2822_date(): assert parse_rfc2822_date("Tue, 10 Aug 2010 20:43:53 -0000") == datetime.datetime( - year=2010, month=8, day=10, hour=20, minute=43, second=53, microsecond=0, + year=2010, + month=8, + day=10, + hour=20, + minute=43, + second=53, + microsecond=0, tzinfo=datetime.timezone(datetime.timedelta(seconds=0)), ) assert parse_rfc2822_date("Thu, 17 Dec 2009 12:04:56 +0200") == datetime.datetime( - year=2009, month=12, day=17, hour=12, minute=4, second=56, microsecond=0, + year=2009, + month=12, + day=17, + hour=12, + 
minute=4, + second=56, + microsecond=0, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)), ) @@ -62,66 +101,83 @@ def test_parse_rfc2822_date(): def test_is_http_url(): # noinspection PyTypeChecker assert not is_http_url(None) - assert not is_http_url('') + assert not is_http_url("") - assert not is_http_url('abc') - assert not is_http_url('/abc') - assert not is_http_url('//abc') - assert not is_http_url('///abc') + assert not is_http_url("abc") + assert not is_http_url("/abc") + assert not is_http_url("//abc") + assert not is_http_url("///abc") - assert not is_http_url('gopher://gopher.floodgap.com/0/v2/vstat') - assert not is_http_url('ftp://ftp.freebsd.org/pub/FreeBSD/') + assert not is_http_url("gopher://gopher.floodgap.com/0/v2/vstat") + assert not is_http_url("ftp://ftp.freebsd.org/pub/FreeBSD/") - assert is_http_url('http://cyber.law.harvard.edu/about') - assert is_http_url('https://github.com/mediacloud/backend') + assert is_http_url("http://cyber.law.harvard.edu/about") + assert is_http_url("https://github.com/mediacloud/backend") # URLs with port, HTTP auth, localhost - assert is_http_url('https://username:password@domain.com:12345/path?query=string#fragment') - assert is_http_url('http://localhost:9998/feed') - assert is_http_url('http://127.0.0.1:12345/456789') - assert is_http_url('http://127.0.00000000.1:8899/tweet_url?id=47') + assert is_http_url( + "https://username:password@domain.com:12345/path?query=string#fragment" + ) + assert is_http_url("http://localhost:9998/feed") + assert is_http_url("http://127.0.0.1:12345/456789") + assert is_http_url("http://127.0.00000000.1:8899/tweet_url?id=47") # Travis URL - assert is_http_url('http://testing-gce-286b4005-b1ae-4b1a-a0d8-faf85e39ca92:37873/gv/tests.rss') + assert is_http_url( + "http://testing-gce-286b4005-b1ae-4b1a-a0d8-faf85e39ca92:37873/gv/tests.rss" + ) # URLs with mistakes fixable by fix_common_url_mistakes() assert not is_http_url( - 'http:/www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled' + "http:/www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled" ) # UTF-8 in paths - assert is_http_url('http://www.example.com/šiaurė.html') + assert is_http_url("http://www.example.com/šiaurė.html") # IDN - assert is_http_url('http://www.šiaurė.lt/šiaurė.html') - assert is_http_url('http://www.xn--iaur-yva35b.lt/šiaurė.html') - assert is_http_url('http://.xn--iaur-yva35b.lt') is False # Invalid Punycode + assert is_http_url("http://www.šiaurė.lt/šiaurė.html") + assert is_http_url("http://www.xn--iaur-yva35b.lt/šiaurė.html") + assert is_http_url("http://.xn--iaur-yva35b.lt") is False # Invalid Punycode def test_strip_url_to_homepage(): - assert strip_url_to_homepage('http://www.cwi.nl:80/%7Eguido/Python.html') == 'http://www.cwi.nl:80/' + assert ( + strip_url_to_homepage("http://www.cwi.nl:80/%7Eguido/Python.html") + == "http://www.cwi.nl:80/" + ) # HTTP auth - assert strip_url_to_homepage( - 'http://username:password@www.cwi.nl/page.html' - ) == 'http://username:password@www.cwi.nl/' + assert ( + strip_url_to_homepage("http://username:password@www.cwi.nl/page.html") + == "http://username:password@www.cwi.nl/" + ) # UTF-8 in paths - assert strip_url_to_homepage('http://www.example.com/šiaurė.html') == 'http://www.example.com/' + assert ( + strip_url_to_homepage("http://www.example.com/šiaurė.html") + == "http://www.example.com/" + ) # IDN - assert strip_url_to_homepage('https://www.šiaurė.lt/šiaurė.html') == 'https://www.šiaurė.lt/' - 
assert strip_url_to_homepage('http://www.xn--iaur-yva35b.lt/šiaurė.html') == 'http://www.xn--iaur-yva35b.lt/' + assert ( + strip_url_to_homepage("https://www.šiaurė.lt/šiaurė.html") + == "https://www.šiaurė.lt/" + ) + assert ( + strip_url_to_homepage("http://www.xn--iaur-yva35b.lt/šiaurė.html") + == "http://www.xn--iaur-yva35b.lt/" + ) with pytest.raises(StripURLToHomepageException): # noinspection PyTypeChecker strip_url_to_homepage(None) with pytest.raises(StripURLToHomepageException): - strip_url_to_homepage('') + strip_url_to_homepage("") with pytest.raises(StripURLToHomepageException): - strip_url_to_homepage('not an URL') + strip_url_to_homepage("not an URL") def test_gunzip(): @@ -130,13 +186,13 @@ def test_gunzip(): gunzip(None) with pytest.raises(GunzipException): # noinspection PyTypeChecker - gunzip('') + gunzip("") with pytest.raises(GunzipException): # noinspection PyTypeChecker - gunzip(b'') + gunzip(b"") with pytest.raises(GunzipException): # noinspection PyTypeChecker - gunzip('foo') + gunzip("foo") with pytest.raises(GunzipException): # noinspection PyTypeChecker - gunzip(b'foo') + gunzip(b"foo") diff --git a/tests/test_tree.py b/tests/test_tree.py index ba04e11..d6cc205 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -37,11 +37,16 @@ class TestSitemapTree(TestCase): - TEST_BASE_URL = 'http://test_ultimate-sitemap-parser.com' # mocked by HTTPretty + TEST_BASE_URL = "http://test_ultimate-sitemap-parser.com" # mocked by HTTPretty # Publication / "last modified" date TEST_DATE_DATETIME = datetime.datetime( - year=2009, month=12, day=17, hour=12, minute=4, second=56, + year=2009, + month=12, + day=17, + hour=12, + minute=4, + second=56, tzinfo=tzoffset(None, 7200), ) TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat() @@ -50,8 +55,8 @@ class TestSitemapTree(TestCase): TEST_DATE_STR_RFC2822 = format_datetime(TEST_DATE_DATETIME) """Test string date formatted as RFC 2822 (for RSS 2.0 sitemaps).""" - TEST_PUBLICATION_NAME = 'Test publication' - TEST_PUBLICATION_LANGUAGE = 'en' + TEST_PUBLICATION_NAME = "Test publication" + TEST_PUBLICATION_LANGUAGE = "en" @staticmethod def fallback_to_404_not_found_matcher(request): @@ -59,8 +64,8 @@ def fallback_to_404_not_found_matcher(request): return requests_mock.create_response( request, status_code=404, - reason='Not Found', - headers={'Content-Type': 'text/html'}, + reason="Not Found", + headers={"Content-Type": "text/html"}, text="

404 Not Found!

", ) @@ -72,41 +77,44 @@ def test_sitemap_tree_for_homepage(self): m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) m.get( - self.TEST_BASE_URL + '/robots.txt', - headers={'Content-Type': 'text/plain'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" User-agent: * Disallow: /whatever - Sitemap: {base_url}/sitemap_pages.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_pages.xml # Intentionally spelled as "Site-map" as Google tolerates this: # https://github.com/google/robotstxt/blob/master/robots.cc#L703 - Site-map: {base_url}/sitemap_news_index_1.xml - """.format(base_url=self.TEST_BASE_URL)).strip(), + Site-map: {self.TEST_BASE_URL}/sitemap_news_index_1.xml + """ + ).strip(), ) # One sitemap for random static pages m.get( - self.TEST_BASE_URL + '/sitemap_pages.xml', - headers={'Content-Type': 'application/xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_pages.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" - {base_url}/about.html - {last_modified_date} + {self.TEST_BASE_URL}/about.html + {self.TEST_DATE_STR_ISO8601} monthly 0.8 - {base_url}/contact.html - {last_modified_date} + {self.TEST_BASE_URL}/contact.html + {self.TEST_DATE_STR_ISO8601} when we feel like it @@ -116,40 +124,44 @@ def test_sitemap_tree_for_homepage(self): - """.format(base_url=self.TEST_BASE_URL, last_modified_date=self.TEST_DATE_STR_ISO8601)).strip(), + """ + ).strip(), ) # Index sitemap pointing to sitemaps with stories m.get( - self.TEST_BASE_URL + '/sitemap_news_index_1.xml', - headers={'Content-Type': 'application/xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_news_index_1.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" - {base_url}/sitemap_news_1.xml - {last_modified} + {self.TEST_BASE_URL}/sitemap_news_1.xml + {self.TEST_DATE_STR_ISO8601} - {base_url}/sitemap_news_index_2.xml - {last_modified} + {self.TEST_BASE_URL}/sitemap_news_index_2.xml + {self.TEST_DATE_STR_ISO8601} - """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(), + """ + ).strip(), ) # First sitemap with actual stories m.get( - self.TEST_BASE_URL + '/sitemap_news_1.xml', - headers={'Content-Type': 'application/xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_news_1.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" - {base_url}/news/foo.html + {self.TEST_BASE_URL}/news/foo.html @@ -157,72 +169,71 @@ def test_sitemap_tree_for_homepage(self): + href="{self.TEST_BASE_URL}/news/foo.html?mobile=1" /> - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} Foo <foo> - {base_url}/news/bar.html + {self.TEST_BASE_URL}/news/bar.html + href="{self.TEST_BASE_URL}/news/bar.html?mobile=1" /> - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} Bar & bar - """.format( - base_url=self.TEST_BASE_URL, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - publication_date=self.TEST_DATE_STR_ISO8601, - )).strip(), + """ + ).strip(), ) # Another index sitemap pointing to a second sitemaps 
with stories m.get( - self.TEST_BASE_URL + '/sitemap_news_index_2.xml', - headers={'Content-Type': 'application/xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_news_index_2.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" - {base_url}/sitemap_news_2.xml - {last_modified} + {self.TEST_BASE_URL}/sitemap_news_2.xml + {self.TEST_DATE_STR_ISO8601} - {base_url}/sitemap_news_missing.xml - {last_modified} + {self.TEST_BASE_URL}/sitemap_news_missing.xml + {self.TEST_DATE_STR_ISO8601} - """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(), + """ + ).strip(), ) # Second sitemap with actual stories m.get( - self.TEST_BASE_URL + '/sitemap_news_2.xml', - headers={'Content-Type': 'application/xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_news_2.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" - {base_url}/news/bar.html + {self.TEST_BASE_URL}/news/bar.html + href="{self.TEST_BASE_URL}/news/bar.html?mobile=1#fragment_is_to_be_removed" /> - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} @@ -249,113 +260,106 @@ def test_sitemap_tree_for_homepage(self): - {base_url}/news/baz.html + {self.TEST_BASE_URL}/news/baz.html + href="{self.TEST_BASE_URL}/news/baz.html?mobile=1" /> - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} - """.format( - base_url=self.TEST_BASE_URL, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - publication_date=self.TEST_DATE_STR_ISO8601, - )).strip(), + """ + ).strip(), ) # Nonexistent sitemap m.get( - self.TEST_BASE_URL + '/sitemap_news_missing.xml', + self.TEST_BASE_URL + "/sitemap_news_missing.xml", status_code=404, - reason='Not Found', - headers={'Content-Type': 'text/html'}, + reason="Not Found", + headers={"Content-Type": "text/html"}, text="

404 Not Found!

", ) expected_sitemap_tree = IndexWebsiteSitemap( - url='{}/'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/", sub_sitemaps=[ IndexRobotsTxtSitemap( - url='{}/robots.txt'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/robots.txt", sub_sitemaps=[ PagesXMLSitemap( - url='{}/sitemap_pages.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_pages.xml", pages=[ SitemapPage( - url='{}/about.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/about.html", last_modified=self.TEST_DATE_DATETIME, news_story=None, change_frequency=SitemapPageChangeFrequency.MONTHLY, - priority=Decimal('0.8'), + priority=Decimal("0.8"), ), SitemapPage( - url='{}/contact.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/contact.html", last_modified=self.TEST_DATE_DATETIME, news_story=None, - # Invalid input -- should be reset to "always" change_frequency=SitemapPageChangeFrequency.ALWAYS, - # Invalid input -- should be reset to 0.5 (the default as per the spec) - priority=Decimal('0.5'), - - ) + priority=Decimal("0.5"), + ), ], ), IndexXMLSitemap( - url='{}/sitemap_news_index_1.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_news_index_1.xml", sub_sitemaps=[ PagesXMLSitemap( - url='{}/sitemap_news_1.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_news_1.xml", pages=[ SitemapPage( - url='{}/news/foo.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/news/foo.html", news_story=SitemapNewsStory( - title='Foo ', + title="Foo ", publish_date=self.TEST_DATE_DATETIME, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, ), ), SitemapPage( - url='{}/news/bar.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/news/bar.html", news_story=SitemapNewsStory( - title='Bar & bar', + title="Bar & bar", publish_date=self.TEST_DATE_DATETIME, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, ), ), - ] + ], ), IndexXMLSitemap( - url='{}/sitemap_news_index_2.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_news_index_2.xml", sub_sitemaps=[ PagesXMLSitemap( - url='{}/sitemap_news_2.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_news_2.xml", pages=[ SitemapPage( - url='{}/news/bar.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/news/bar.html", news_story=SitemapNewsStory( - title='Bar & bar', + title="Bar & bar", publish_date=self.TEST_DATE_DATETIME, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, ), ), SitemapPage( - url='{}/news/baz.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/news/baz.html", news_story=SitemapNewsStory( - title='Bąž', + title="Bąž", publish_date=self.TEST_DATE_DATETIME, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, @@ -364,11 +368,11 @@ def test_sitemap_tree_for_homepage(self): ], ), InvalidSitemap( - url='{}/sitemap_news_missing.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_news_missing.xml", reason=( - 'Unable to fetch sitemap from {base_url}/sitemap_news_missing.xml: ' - '404 Not Found' - ).format(base_url=self.TEST_BASE_URL), + f"Unable to fetch sitemap from {self.TEST_BASE_URL}/sitemap_news_missing.xml: " + "404 Not Found" + ), ), ], ), @@ -376,15 +380,17 @@ def test_sitemap_tree_for_homepage(self): ), ], ) - ] + ], ) - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 
+ actual_sitemap_tree = sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) expected_lines = str(expected_sitemap_tree).split() actual_lines = str(actual_sitemap_tree).split() diff = difflib.ndiff(expected_lines, actual_lines) - diff_str = '\n'.join(diff) + diff_str = "\n".join(diff) assert expected_sitemap_tree == actual_sitemap_tree, diff_str @@ -397,113 +403,114 @@ def test_sitemap_tree_for_homepage_gzip(self): m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) m.get( - self.TEST_BASE_URL + '/robots.txt', - headers={'Content-Type': 'text/plain'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" User-agent: * Disallow: /whatever - Sitemap: {base_url}/sitemap_1.gz - Sitemap: {base_url}/sitemap_2.dat - Sitemap: {base_url}/sitemap_3.xml.gz - """.format(base_url=self.TEST_BASE_URL)).strip(), + Sitemap: {self.TEST_BASE_URL}/sitemap_1.gz + Sitemap: {self.TEST_BASE_URL}/sitemap_2.dat + Sitemap: {self.TEST_BASE_URL}/sitemap_3.xml.gz + """ + ).strip(), ) # Gzipped sitemap without correct HTTP header but with .gz extension m.get( - self.TEST_BASE_URL + '/sitemap_1.gz', - content=gzip(textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_1.gz", + content=gzip( + textwrap.dedent( + f""" - {base_url}/news/foo.html + {self.TEST_BASE_URL}/news/foo.html - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} Foo <foo> - """.format( - base_url=self.TEST_BASE_URL, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - publication_date=self.TEST_DATE_STR_ISO8601, - )).strip()), + """ + ).strip() + ), ) # Gzipped sitemap with correct HTTP header but without .gz extension m.get( - self.TEST_BASE_URL + '/sitemap_2.dat', - headers={'Content-Type': 'application/x-gzip'}, - content=gzip(textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_2.dat", + headers={"Content-Type": "application/x-gzip"}, + content=gzip( + textwrap.dedent( + f""" - {base_url}/news/bar.html + {self.TEST_BASE_URL}/news/bar.html - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} - """.format( - base_url=self.TEST_BASE_URL, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - publication_date=self.TEST_DATE_STR_ISO8601, - )).strip()), + """ + ).strip() + ), ) # Sitemap which appears to be gzipped (due to extension and Content-Type) but really isn't m.get( - self.TEST_BASE_URL + '/sitemap_3.xml.gz', - headers={'Content-Type': 'application/x-gzip'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_3.xml.gz", + headers={"Content-Type": "application/x-gzip"}, + text=textwrap.dedent( + f""" - {base_url}/news/baz.html + {self.TEST_BASE_URL}/news/baz.html - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} - """.format( - base_url=self.TEST_BASE_URL, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - publication_date=self.TEST_DATE_STR_ISO8601, - )).strip(), + """ + ).strip(), ) - actual_sitemap_tree = 
sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + actual_sitemap_tree = sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) # Don't do an in-depth check, we just need to make sure that gunzip works assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) assert len(actual_sitemap_tree.sub_sitemaps) == 1 - assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap) + assert isinstance( + actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap + ) # noinspection PyUnresolvedReferences assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 3 @@ -529,52 +536,64 @@ def test_sitemap_tree_for_homepage_plain_text(self): m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) m.get( - self.TEST_BASE_URL + '/robots.txt', - headers={'Content-Type': 'text/plain'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" User-agent: * Disallow: /whatever - Sitemap: {base_url}/sitemap_1.txt - Sitemap: {base_url}/sitemap_2.txt.dat - """.format(base_url=self.TEST_BASE_URL)).strip(), + Sitemap: {self.TEST_BASE_URL}/sitemap_1.txt + Sitemap: {self.TEST_BASE_URL}/sitemap_2.txt.dat + """ + ).strip(), ) # Plain text uncompressed sitemap (no Content-Type header) m.get( - self.TEST_BASE_URL + '/sitemap_1.txt', - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_1.txt", + text=textwrap.dedent( + f""" - {base_url}/news/foo.html + {self.TEST_BASE_URL}/news/foo.html - {base_url}/news/bar.html + {self.TEST_BASE_URL}/news/bar.html Some other stuff which totally doesn't look like an URL - """.format(base_url=self.TEST_BASE_URL)).strip(), + """ + ).strip(), ) # Plain text compressed sitemap without .gz extension m.get( - self.TEST_BASE_URL + '/sitemap_2.txt.dat', - headers={'Content-Type': 'application/x-gzip'}, - content=gzip(textwrap.dedent(""" - {base_url}/news/bar.html - {base_url}/news/baz.html - """.format(base_url=self.TEST_BASE_URL)).strip()), + self.TEST_BASE_URL + "/sitemap_2.txt.dat", + headers={"Content-Type": "application/x-gzip"}, + content=gzip( + textwrap.dedent( + f""" + {self.TEST_BASE_URL}/news/bar.html + {self.TEST_BASE_URL}/news/baz.html + """ + ).strip() + ), ) - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + actual_sitemap_tree = sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) assert len(actual_sitemap_tree.sub_sitemaps) == 1 - assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap) + assert isinstance( + actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap + ) # noinspection PyUnresolvedReferences assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2 @@ -590,9 +609,9 @@ def test_sitemap_tree_for_homepage_plain_text(self): pages = list(actual_sitemap_tree.all_pages()) assert len(pages) == 4 - assert SitemapPage(url='{}/news/foo.html'.format(self.TEST_BASE_URL)) in pages - assert SitemapPage(url='{}/news/bar.html'.format(self.TEST_BASE_URL)) in pages - assert SitemapPage(url='{}/news/baz.html'.format(self.TEST_BASE_URL)) in pages + assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/foo.html") in pages + assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/bar.html") in pages + assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/baz.html") in pages # noinspection DuplicatedCode 
def test_sitemap_tree_for_homepage_rss_atom(self): @@ -602,107 +621,114 @@ def test_sitemap_tree_for_homepage_rss_atom(self): m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) m.get( - self.TEST_BASE_URL + '/robots.txt', - headers={'Content-Type': 'text/plain'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" User-agent: * Disallow: /whatever - Sitemap: {base_url}/sitemap_rss.xml - Sitemap: {base_url}/sitemap_atom_0_3.xml - Sitemap: {base_url}/sitemap_atom_1_0.xml - """.format(base_url=self.TEST_BASE_URL)).strip(), + Sitemap: {self.TEST_BASE_URL}/sitemap_rss.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_0_3.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_1_0.xml + """ + ).strip(), ) # RSS 2.0 sitemap m.get( - self.TEST_BASE_URL + '/sitemap_rss.xml', - headers={'Content-Type': 'application/rss+xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_rss.xml", + headers={"Content-Type": "application/rss+xml"}, + text=textwrap.dedent( + f""" Test RSS 2.0 feed This is a test RSS 2.0 feed. - {base_url} - {pub_date} + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_RFC2822} Test RSS 2.0 story #1 This is a test RSS 2.0 story #1. - {base_url}/rss_story_1.html - {base_url}/rss_story_1.html - {pub_date} + {self.TEST_BASE_URL}/rss_story_1.html + {self.TEST_BASE_URL}/rss_story_1.html + {self.TEST_DATE_STR_RFC2822} Test RSS 2.0 story #2 This is a test RSS 2.0 story #2. - {base_url}/rss_story_2.html - {base_url}/rss_story_2.html - {pub_date} + {self.TEST_BASE_URL}/rss_story_2.html + {self.TEST_BASE_URL}/rss_story_2.html + {self.TEST_DATE_STR_RFC2822} - """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(), + """ + ).strip(), ) # Atom 0.3 sitemap m.get( - self.TEST_BASE_URL + '/sitemap_atom_0_3.xml', - headers={'Content-Type': 'application/atom+xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_atom_0_3.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" Test Atom 0.3 feed - - {pub_date} + + {self.TEST_DATE_STR_ISO8601} Test Atom 0.3 story #1 - - {base_url}/atom_0_3_story_1.html - {pub_date} + + {self.TEST_BASE_URL}/atom_0_3_story_1.html + {self.TEST_DATE_STR_ISO8601} Test Atom 0.3 story #2 - - {base_url}/atom_0_3_story_2.html - {pub_date} + + {self.TEST_BASE_URL}/atom_0_3_story_2.html + {self.TEST_DATE_STR_ISO8601} - """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(), + """ + ).strip(), ) # Atom 1.0 sitemap m.get( - self.TEST_BASE_URL + '/sitemap_atom_1_0.xml', - headers={'Content-Type': 'application/atom+xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_atom_1_0.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" Test Atom 1.0 feed This is a test Atom 1.0 feed. - - - {base_url} - {pub_date} + + + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_ISO8601} Test Atom 1.0 story #1 - - - - {base_url}/atom_1_0_story_1.html - {pub_date} + + + + {self.TEST_BASE_URL}/atom_1_0_story_1.html + {self.TEST_DATE_STR_ISO8601} This is test atom 1.0 story #1.
@@ -717,11 +743,11 @@ def test_sitemap_tree_for_homepage_rss_atom(self): Test Atom 1.0 story #2 - - - - {base_url}/atom_1_0_story_2.html - {pub_date} + + + + {self.TEST_BASE_URL}/atom_1_0_story_2.html + {self.TEST_DATE_STR_ISO8601} This is test atom 1.0 story #2.
@@ -735,83 +761,86 @@ def test_sitemap_tree_for_homepage_rss_atom(self): - """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(), + """ + ).strip(), ) expected_sitemap_tree = IndexWebsiteSitemap( - url='{}/'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/", sub_sitemaps=[ IndexRobotsTxtSitemap( - url='{}/robots.txt'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/robots.txt", sub_sitemaps=[ PagesRSSSitemap( - url='{}/sitemap_rss.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_rss.xml", pages=[ SitemapPage( - url='{}/rss_story_1.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/rss_story_1.html", news_story=SitemapNewsStory( - title='Test RSS 2.0 story #1', + title="Test RSS 2.0 story #1", publish_date=self.TEST_DATE_DATETIME, ), ), SitemapPage( - url='{}/rss_story_2.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/rss_story_2.html", news_story=SitemapNewsStory( - title='Test RSS 2.0 story #2', + title="Test RSS 2.0 story #2", publish_date=self.TEST_DATE_DATETIME, - ) - ) - ] + ), + ), + ], ), PagesAtomSitemap( - url='{}/sitemap_atom_0_3.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_atom_0_3.xml", pages=[ SitemapPage( - url='{}/atom_0_3_story_1.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/atom_0_3_story_1.html", news_story=SitemapNewsStory( - title='Test Atom 0.3 story #1', + title="Test Atom 0.3 story #1", publish_date=self.TEST_DATE_DATETIME, ), ), SitemapPage( - url='{}/atom_0_3_story_2.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/atom_0_3_story_2.html", news_story=SitemapNewsStory( - title='Test Atom 0.3 story #2', + title="Test Atom 0.3 story #2", publish_date=self.TEST_DATE_DATETIME, - ) - ) - ] + ), + ), + ], ), PagesAtomSitemap( - url='{}/sitemap_atom_1_0.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_atom_1_0.xml", pages=[ SitemapPage( - url='{}/atom_1_0_story_1.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/atom_1_0_story_1.html", news_story=SitemapNewsStory( - title='Test Atom 1.0 story #1', + title="Test Atom 1.0 story #1", publish_date=self.TEST_DATE_DATETIME, ), ), SitemapPage( - url='{}/atom_1_0_story_2.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/atom_1_0_story_2.html", news_story=SitemapNewsStory( - title='Test Atom 1.0 story #2', + title="Test Atom 1.0 story #2", publish_date=self.TEST_DATE_DATETIME, - ) - ) - ] + ), + ), + ], ), - ] + ], ) - ] + ], ) - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + actual_sitemap_tree = sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) expected_lines = str(expected_sitemap_tree).split() actual_lines = str(actual_sitemap_tree).split() diff = difflib.ndiff(expected_lines, actual_lines) - diff_str = '\n'.join(diff) + diff_str = "\n".join(diff) assert expected_sitemap_tree == actual_sitemap_tree, diff_str @@ -824,95 +853,105 @@ def test_sitemap_tree_for_homepage_rss_atom_empty(self): m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) m.get( - self.TEST_BASE_URL + '/robots.txt', - headers={'Content-Type': 'text/plain'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" User-agent: * Disallow: /whatever - Sitemap: {base_url}/sitemap_rss.xml - Sitemap: 
{base_url}/sitemap_atom_0_3.xml - Sitemap: {base_url}/sitemap_atom_1_0.xml - """.format(base_url=self.TEST_BASE_URL)).strip(), + Sitemap: {self.TEST_BASE_URL}/sitemap_rss.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_0_3.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_1_0.xml + """ + ).strip(), ) # RSS 2.0 sitemap m.get( - self.TEST_BASE_URL + '/sitemap_rss.xml', - headers={'Content-Type': 'application/rss+xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_rss.xml", + headers={"Content-Type": "application/rss+xml"}, + text=textwrap.dedent( + f""" Test RSS 2.0 feed This is a test RSS 2.0 feed. - {base_url} - {pub_date} + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_RFC2822} - """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(), + """ + ).strip(), ) # Atom 0.3 sitemap m.get( - self.TEST_BASE_URL + '/sitemap_atom_0_3.xml', - headers={'Content-Type': 'application/atom+xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_atom_0_3.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" Test Atom 0.3 feed - - {pub_date} + + {self.TEST_DATE_STR_ISO8601} - """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(), + """ + ).strip(), ) # Atom 1.0 sitemap m.get( - self.TEST_BASE_URL + '/sitemap_atom_1_0.xml', - headers={'Content-Type': 'application/atom+xml'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_atom_1_0.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" Test Atom 1.0 feed This is a test Atom 1.0 feed. - - - {base_url} - {pub_date} + + + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_ISO8601} - """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(), + """ + ).strip(), ) expected_sitemap_tree = IndexWebsiteSitemap( - url='{}/'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/", sub_sitemaps=[ IndexRobotsTxtSitemap( - url='{}/robots.txt'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/robots.txt", sub_sitemaps=[ PagesRSSSitemap( - url='{}/sitemap_rss.xml'.format(self.TEST_BASE_URL), - pages=[] + url=f"{self.TEST_BASE_URL}/sitemap_rss.xml", + pages=[], ), PagesAtomSitemap( - url='{}/sitemap_atom_0_3.xml'.format(self.TEST_BASE_URL), - pages=[] + url=f"{self.TEST_BASE_URL}/sitemap_atom_0_3.xml", + pages=[], ), PagesAtomSitemap( - url='{}/sitemap_atom_1_0.xml'.format(self.TEST_BASE_URL), - pages=[] + url=f"{self.TEST_BASE_URL}/sitemap_atom_1_0.xml", + pages=[], ), - ] + ], ) - ] + ], ) - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + actual_sitemap_tree = sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) assert expected_sitemap_tree == actual_sitemap_tree @@ -930,73 +969,76 @@ def test_sitemap_tree_for_homepage_prematurely_ending_xml(self): m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) m.get( - self.TEST_BASE_URL + '/robots.txt', - headers={'Content-Type': 'text/plain'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" User-agent: * Disallow: /whatever - Sitemap: {base_url}/sitemap.xml - """.format(base_url=self.TEST_BASE_URL)).strip(), + Sitemap: {self.TEST_BASE_URL}/sitemap.xml + """ + ).strip(), ) m.get( - self.TEST_BASE_URL + '/sitemap.xml', - text=textwrap.dedent(""" + self.TEST_BASE_URL + 
"/sitemap.xml", + text=textwrap.dedent( + f""" - {base_url}/news/first.html + {self.TEST_BASE_URL}/news/first.html - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} First story - {base_url}/news/second.html + {self.TEST_BASE_URL}/news/second.html - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} Second story - {base_url}/news/third.html + {self.TEST_BASE_URL}/news/third.html - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {base_url}/news/public.html + {self.TEST_BASE_URL}/news/public.html - """.format( - base_url=self.TEST_BASE_URL, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - publication_date=self.TEST_DATE_STR_ISO8601, - )).strip(), + """ + ).strip(), ) # Private sitemap (to be discovered by trying out a few paths) m.get( - self.TEST_BASE_URL + '/sitemap_index.xml', - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap_index.xml", + text=textwrap.dedent( + f""" - {base_url}/news/private.html + {self.TEST_BASE_URL}/news/private.html - """.format( - base_url=self.TEST_BASE_URL, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - publication_date=self.TEST_DATE_STR_ISO8601, - )).strip(), + """ + ).strip(), ) expected_sitemap_tree = IndexWebsiteSitemap( - url='{}/'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/", sub_sitemaps=[ IndexRobotsTxtSitemap( - url='{}/robots.txt'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/robots.txt", sub_sitemaps=[ PagesXMLSitemap( - url='{}/sitemap_public.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_public.xml", pages=[ SitemapPage( - url='{}/news/public.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/news/public.html", ), ], ), ], ), PagesXMLSitemap( - url='{}/sitemap_index.xml'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/sitemap_index.xml", pages=[ SitemapPage( - url='{}/news/private.html'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/news/private.html", ), ], ), - ] + ], ) - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + actual_sitemap_tree = sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) assert expected_sitemap_tree == actual_sitemap_tree @@ -1135,30 +1179,34 @@ def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self): m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) m.get( - self.TEST_BASE_URL + '/robots.txt', - headers={'Content-Type': ''}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": ""}, + text=textwrap.dedent( + """ User-agent: * Disallow: /whatever - """.format(base_url=self.TEST_BASE_URL)).strip(), + """.format() + ).strip(), ) expected_sitemap_tree = IndexWebsiteSitemap( - url='{}/'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/", sub_sitemaps=[ IndexRobotsTxtSitemap( - url='{}/robots.txt'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/robots.txt", sub_sitemaps=[], ) - ] + ], ) - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + actual_sitemap_tree = 
sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) assert expected_sitemap_tree == actual_sitemap_tree @@ -1169,25 +1217,27 @@ def test_sitemap_tree_for_homepage_no_robots_txt(self): m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) # Nonexistent robots.txt m.get( - self.TEST_BASE_URL + '/robots.txt', + self.TEST_BASE_URL + "/robots.txt", status_code=404, - reason='Not Found', - headers={'Content-Type': 'text/html'}, + reason="Not Found", + headers={"Content-Type": "text/html"}, text="

404 Not Found!

", ) expected_sitemap_tree = IndexWebsiteSitemap( - url='{}/'.format(self.TEST_BASE_URL), + url=f"{self.TEST_BASE_URL}/", sub_sitemaps=[], ) - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + actual_sitemap_tree = sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) assert expected_sitemap_tree == actual_sitemap_tree @@ -1202,9 +1252,9 @@ def test_sitemap_tree_for_homepage_huge_sitemap(self): xmlns:xhtml="http://www.w3.org/1999/xhtml"> """ for x in range(page_count): - sitemap_xml += """ + sitemap_xml += f""" - {base_url}/news/page_{x}.html + {self.TEST_BASE_URL}/news/page_{x}.html @@ -1212,24 +1262,18 @@ def test_sitemap_tree_for_homepage_huge_sitemap(self): + href="{self.TEST_BASE_URL}/news/page_{x}.html?mobile=1" /> - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} Foo <foo> - """.format( - x=x, - base_url=self.TEST_BASE_URL, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - publication_date=self.TEST_DATE_STR_ISO8601, - ) + """ sitemap_xml += "
" @@ -1237,28 +1281,32 @@ def test_sitemap_tree_for_homepage_huge_sitemap(self): m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) m.get( - self.TEST_BASE_URL + '/robots.txt', - headers={'Content-Type': 'text/plain'}, - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" User-agent: * Disallow: /whatever - Sitemap: {base_url}/sitemap.xml.gz - """.format(base_url=self.TEST_BASE_URL)).strip(), + Sitemap: {self.TEST_BASE_URL}/sitemap.xml.gz + """ + ).strip(), ) m.get( - self.TEST_BASE_URL + '/sitemap.xml.gz', - headers={'Content-Type': 'application/x-gzip'}, + self.TEST_BASE_URL + "/sitemap.xml.gz", + headers={"Content-Type": "application/x-gzip"}, content=gzip(sitemap_xml), ) - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + actual_sitemap_tree = sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) assert len(list(actual_sitemap_tree.all_pages())) == page_count @@ -1269,104 +1317,104 @@ def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self): m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) robots_txt_body = "" robots_txt_body += "User-agent: *\n" # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL - robots_txt_body += " Sitemap:{base_url}/sitemap.xml ".format(base_url=self.TEST_BASE_URL) + robots_txt_body += f" Sitemap:{self.TEST_BASE_URL}/sitemap.xml " m.get( - self.TEST_BASE_URL + '/robots.txt', - headers={'Content-Type': 'text/plain'}, + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, text=robots_txt_body, ) m.get( - self.TEST_BASE_URL + '/sitemap.xml', - text=textwrap.dedent(""" + self.TEST_BASE_URL + "/sitemap.xml", + text=textwrap.dedent( + f""" - {base_url}/news/first.html + {self.TEST_BASE_URL}/news/first.html - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} First story - """.format( - base_url=self.TEST_BASE_URL, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - publication_date=self.TEST_DATE_STR_ISO8601, - )).strip(), + """ + ).strip(), ) - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + actual_sitemap_tree = sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) assert len(list(actual_sitemap_tree.all_pages())) == 1 def test_sitemap_tree_for_homepage_utf8_bom(self): """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and sitemap.""" - robots_txt_body = textwrap.dedent(""" + robots_txt_body = textwrap.dedent( + f""" User-agent: * Disallow: /whatever - Sitemap: {base_url}/sitemap.xml - """.format(base_url=self.TEST_BASE_URL)).strip() + Sitemap: {self.TEST_BASE_URL}/sitemap.xml + """ + ).strip() - sitemap_xml_body = textwrap.dedent(""" + sitemap_xml_body = textwrap.dedent( + f""" - {base_url}/news/first.html + {self.TEST_BASE_URL}/news/first.html - {publication_name} - {publication_language} + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} - {publication_date} + {self.TEST_DATE_STR_ISO8601} First story - """.format( - base_url=self.TEST_BASE_URL, - 
publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - publication_date=self.TEST_DATE_STR_ISO8601, - )).strip() + """ + ).strip() - robots_txt_body_encoded = robots_txt_body.encode('utf-8-sig') - sitemap_xml_body_encoded = sitemap_xml_body.encode('utf-8-sig') + robots_txt_body_encoded = robots_txt_body.encode("utf-8-sig") + sitemap_xml_body_encoded = sitemap_xml_body.encode("utf-8-sig") with requests_mock.Mocker() as m: m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) m.get( - self.TEST_BASE_URL + '/', - text='This is a homepage.', + self.TEST_BASE_URL + "/", + text="This is a homepage.", ) m.get( - self.TEST_BASE_URL + '/robots.txt', - headers={'Content-Type': 'text/plain'}, + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, content=robots_txt_body_encoded, ) m.get( - self.TEST_BASE_URL + '/sitemap.xml', + self.TEST_BASE_URL + "/sitemap.xml", content=sitemap_xml_body_encoded, ) - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + actual_sitemap_tree = sitemap_tree_for_homepage( + homepage_url=self.TEST_BASE_URL + ) assert len(list(actual_sitemap_tree.all_pages())) == 1 diff --git a/tests/web_client/test_requests_client.py b/tests/web_client/test_requests_client.py index 45be93d..450545c 100644 --- a/tests/web_client/test_requests_client.py +++ b/tests/web_client/test_requests_client.py @@ -14,11 +14,11 @@ class TestRequestsClient(TestCase): - TEST_BASE_URL = 'http://test-ultimate-sitemap-parser.com' # mocked by HTTPretty - TEST_CONTENT_TYPE = 'text/html' + TEST_BASE_URL = "http://test-ultimate-sitemap-parser.com" # mocked by HTTPretty + TEST_CONTENT_TYPE = "text/html" __slots__ = [ - '__client', + "__client", ] def setUp(self) -> None: @@ -28,12 +28,12 @@ def setUp(self) -> None: def test_get(self): with requests_mock.Mocker() as m: - test_url = self.TEST_BASE_URL + '/' - test_content = 'This is a homepage.' + test_url = self.TEST_BASE_URL + "/" + test_content = "This is a homepage." 
m.get( test_url, - headers={'Content-Type': self.TEST_CONTENT_TYPE}, + headers={"Content-Type": self.TEST_CONTENT_TYPE}, text=test_content, ) @@ -43,18 +43,18 @@ def test_get(self): assert isinstance(response, AbstractWebClientSuccessResponse) assert response.status_code() == HTTPStatus.OK.value assert response.status_message() == HTTPStatus.OK.phrase - assert response.header('Content-Type') == self.TEST_CONTENT_TYPE - assert response.header('content-type') == self.TEST_CONTENT_TYPE - assert response.header('nonexistent') is None - assert response.raw_data().decode('utf-8') == test_content + assert response.header("Content-Type") == self.TEST_CONTENT_TYPE + assert response.header("content-type") == self.TEST_CONTENT_TYPE + assert response.header("nonexistent") is None + assert response.raw_data().decode("utf-8") == test_content def test_get_user_agent(self): with requests_mock.Mocker() as m: - test_url = self.TEST_BASE_URL + '/' + test_url = self.TEST_BASE_URL + "/" def content_user_agent(request, context): context.status_code = HTTPStatus.OK.value - return request.headers.get('User-Agent', 'unknown') + return request.headers.get("User-Agent", "unknown") m.get( test_url, @@ -66,19 +66,19 @@ def content_user_agent(request, context): assert response assert isinstance(response, AbstractWebClientSuccessResponse) - content = response.raw_data().decode('utf-8') - assert content == 'ultimate_sitemap_parser/{}'.format(__version__) + content = response.raw_data().decode("utf-8") + assert content == f"ultimate_sitemap_parser/{__version__}" def test_get_not_found(self): with requests_mock.Mocker() as m: - test_url = self.TEST_BASE_URL + '/404.html' + test_url = self.TEST_BASE_URL + "/404.html" m.get( test_url, status_code=HTTPStatus.NOT_FOUND.value, reason=HTTPStatus.NOT_FOUND.phrase, - headers={'Content-Type': self.TEST_CONTENT_TYPE}, - text='This page does not exist.', + headers={"Content-Type": self.TEST_CONTENT_TYPE}, + text="This page does not exist.", ) response = self.__client.get(test_url) @@ -88,26 +88,29 @@ def test_get_not_found(self): assert response.retryable() is False def test_get_nonexistent_domain(self): - test_url = 'http://www.totallydoesnotexisthjkfsdhkfsd.com/some_page.html' + test_url = "http://www.totallydoesnotexisthjkfsdhkfsd.com/some_page.html" response = self.__client.get(test_url) assert response assert isinstance(response, WebClientErrorResponse) assert response.retryable() is False - assert re.search( - r'Failed to (establish a new connection|resolve)', - response.message()) is not None + assert ( + re.search( + r"Failed to (establish a new connection|resolve)", response.message() + ) + is not None + ) def test_get_timeout(self): sock = socket.socket() - sock.bind(('', 0)) + sock.bind(("", 0)) socket_port = sock.getsockname()[1] assert socket_port sock.listen(1) test_timeout = 1 - test_url = 'http://127.0.0.1:{}/slow_page.html'.format(socket_port) + test_url = f"http://127.0.0.1:{socket_port}/slow_page.html" self.__client.set_timeout(test_timeout) @@ -118,19 +121,19 @@ def test_get_timeout(self): assert response assert isinstance(response, WebClientErrorResponse) assert response.retryable() is True - assert 'Read timed out' in response.message() + assert "Read timed out" in response.message() def test_get_max_response_data_length(self): with requests_mock.Mocker() as m: actual_length = 1024 * 1024 max_length = 1024 * 512 - test_url = self.TEST_BASE_URL + '/huge_page.html' - test_content = 'a' * actual_length + test_url = self.TEST_BASE_URL + "/huge_page.html" + 
test_content = "a" * actual_length m.get( test_url, - headers={'Content-Type': self.TEST_CONTENT_TYPE}, + headers={"Content-Type": self.TEST_CONTENT_TYPE}, text=test_content, ) diff --git a/usp/exceptions.py b/usp/exceptions.py index 7ad7332..88546cf 100644 --- a/usp/exceptions.py +++ b/usp/exceptions.py @@ -5,6 +5,7 @@ class SitemapException(Exception): """ Problem due to which we can't run further, e.g. wrong input parameters. """ + pass @@ -12,6 +13,7 @@ class SitemapXMLParsingException(Exception): """ XML parsing exception to be handled gracefully. """ + pass @@ -19,6 +21,7 @@ class GunzipException(Exception): """ gunzip() exception. """ + pass @@ -26,4 +29,5 @@ class StripURLToHomepageException(Exception): """ strip_url_to_homepage() exception. """ + pass diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index 7f6d577..e39ab1a 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -43,7 +43,7 @@ log = create_logger(__name__) -class SitemapFetcher(object): +class SitemapFetcher: """robots.txt / XML / plain text sitemap fetcher.""" __MAX_SITEMAP_SIZE = 100 * 1024 * 1024 @@ -55,18 +55,24 @@ class SitemapFetcher(object): """Max. recursion level in iterating over sub-sitemaps.""" __slots__ = [ - '_url', - '_recursion_level', - '_web_client', + "_url", + "_recursion_level", + "_web_client", ] - def __init__(self, url: str, recursion_level: int, web_client: Optional[AbstractWebClient] = None): - + def __init__( + self, + url: str, + recursion_level: int, + web_client: Optional[AbstractWebClient] = None, + ): if recursion_level > self.__MAX_RECURSION_LEVEL: - raise SitemapException("Recursion level exceeded {} for URL {}.".format(self.__MAX_RECURSION_LEVEL, url)) + raise SitemapException( + f"Recursion level exceeded {self.__MAX_RECURSION_LEVEL} for URL {url}." 
+ ) if not is_http_url(url): - raise SitemapException("URL {} is not a HTTP(s) URL.".format(url)) + raise SitemapException(f"URL {url} is not a HTTP(s) URL.") if not web_client: web_client = RequestsWebClient() @@ -78,13 +84,15 @@ def __init__(self, url: str, recursion_level: int, web_client: Optional[Abstract self._recursion_level = recursion_level def sitemap(self) -> AbstractSitemap: - log.info("Fetching level {} sitemap from {}...".format(self._recursion_level, self._url)) - response = get_url_retry_on_client_errors(url=self._url, web_client=self._web_client) + log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...") + response = get_url_retry_on_client_errors( + url=self._url, web_client=self._web_client + ) if isinstance(response, WebClientErrorResponse): return InvalidSitemap( url=self._url, - reason="Unable to fetch sitemap from {}: {}".format(self._url, response.message()), + reason=f"Unable to fetch sitemap from {self._url}: {response.message()}", ) assert isinstance(response, AbstractWebClientSuccessResponse) @@ -92,7 +100,7 @@ def sitemap(self) -> AbstractSitemap: response_content = ungzipped_response_content(url=self._url, response=response) # MIME types returned in Content-Type are unpredictable, so peek into the content instead - if response_content[:20].strip().startswith('<'): + if response_content[:20].strip().startswith("<"): # XML sitemap (the specific kind is to be determined later) parser = XMLSitemapParser( url=self._url, @@ -103,7 +111,7 @@ def sitemap(self) -> AbstractSitemap: else: # Assume that it's some sort of a text file (robots.txt or plain text sitemap) - if self._url.endswith('/robots.txt'): + if self._url.endswith("/robots.txt"): parser = IndexRobotsTxtSitemapParser( url=self._url, content=response_content, @@ -118,23 +126,29 @@ def sitemap(self) -> AbstractSitemap: web_client=self._web_client, ) - log.info("Parsing sitemap from URL {}...".format(self._url)) + log.info(f"Parsing sitemap from URL {self._url}...") sitemap = parser.sitemap() return sitemap -class AbstractSitemapParser(object, metaclass=abc.ABCMeta): +class AbstractSitemapParser(metaclass=abc.ABCMeta): """Abstract robots.txt / XML / plain text sitemap parser.""" __slots__ = [ - '_url', - '_content', - '_web_client', - '_recursion_level', + "_url", + "_content", + "_web_client", + "_recursion_level", ] - def __init__(self, url: str, content: str, recursion_level: int, web_client: AbstractWebClient): + def __init__( + self, + url: str, + content: str, + recursion_level: int, + web_client: AbstractWebClient, + ): self._url = url self._content = content self._recursion_level = recursion_level @@ -148,27 +162,43 @@ def sitemap(self) -> AbstractSitemap: class IndexRobotsTxtSitemapParser(AbstractSitemapParser): """robots.txt index sitemap parser.""" - def __init__(self, url: str, content: str, recursion_level: int, web_client: AbstractWebClient): - super().__init__(url=url, content=content, recursion_level=recursion_level, web_client=web_client) - - if not self._url.endswith('/robots.txt'): - raise SitemapException("URL does not look like robots.txt URL: {}".format(self._url)) + def __init__( + self, + url: str, + content: str, + recursion_level: int, + web_client: AbstractWebClient, + ): + super().__init__( + url=url, + content=content, + recursion_level=recursion_level, + web_client=web_client, + ) + + if not self._url.endswith("/robots.txt"): + raise SitemapException( + f"URL does not look like robots.txt URL: {self._url}" + ) def sitemap(self) -> AbstractSitemap: - # Serves 
as an ordered set because we want to deduplicate URLs but also retain the order sitemap_urls = OrderedDict() for robots_txt_line in self._content.splitlines(): robots_txt_line = robots_txt_line.strip() # robots.txt is supposed to be case sensitive but who cares in these Node.js times? - sitemap_match = re.search(r'^site-?map:\s*(.+?)$', robots_txt_line, flags=re.IGNORECASE) + sitemap_match = re.search( + r"^site-?map:\s*(.+?)$", robots_txt_line, flags=re.IGNORECASE + ) if sitemap_match: sitemap_url = sitemap_match.group(1) if is_http_url(sitemap_url): sitemap_urls[sitemap_url] = True else: - log.warning("Sitemap URL {} doesn't look like an URL, skipping".format(sitemap_url)) + log.warning( + f"Sitemap URL {sitemap_url} doesn't look like an URL, skipping" + ) sub_sitemaps = [] @@ -190,7 +220,6 @@ class PlainTextSitemapParser(AbstractSitemapParser): """Plain text sitemap parser.""" def sitemap(self) -> AbstractSitemap: - story_urls = OrderedDict() for story_url in self._content.splitlines(): @@ -200,7 +229,7 @@ def sitemap(self) -> AbstractSitemap: if is_http_url(story_url): story_urls[story_url] = True else: - log.warning("Story URL {} doesn't look like an URL, skipping".format(story_url)) + log.warning(f"Story URL {story_url} doesn't look like an URL, skipping") pages = [] for page_url in story_urls.keys(): @@ -215,21 +244,33 @@ def sitemap(self) -> AbstractSitemap: class XMLSitemapParser(AbstractSitemapParser): """XML sitemap parser.""" - __XML_NAMESPACE_SEPARATOR = ' ' + __XML_NAMESPACE_SEPARATOR = " " __slots__ = [ - '_concrete_parser', + "_concrete_parser", ] - def __init__(self, url: str, content: str, recursion_level: int, web_client: AbstractWebClient): - super().__init__(url=url, content=content, recursion_level=recursion_level, web_client=web_client) + def __init__( + self, + url: str, + content: str, + recursion_level: int, + web_client: AbstractWebClient, + ): + super().__init__( + url=url, + content=content, + recursion_level=recursion_level, + web_client=web_client, + ) # Will be initialized when the type of sitemap is known self._concrete_parser = None def sitemap(self) -> AbstractSitemap: - - parser = xml.parsers.expat.ParserCreate(namespace_separator=self.__XML_NAMESPACE_SEPARATOR) + parser = xml.parsers.expat.ParserCreate( + namespace_separator=self.__XML_NAMESPACE_SEPARATOR + ) parser.StartElementHandler = self._xml_element_start parser.EndElementHandler = self._xml_element_end parser.CharacterDataHandler = self._xml_char_data @@ -240,12 +281,12 @@ def sitemap(self) -> AbstractSitemap: except Exception as ex: # Some sitemap XML files might end abruptly because webservers might be timing out on returning huge XML # files so don't return InvalidSitemap() but try to get as much pages as possible - log.error("Parsing sitemap from URL {} failed: {}".format(self._url, ex)) + log.error(f"Parsing sitemap from URL {self._url} failed: {ex}") if not self._concrete_parser: return InvalidSitemap( url=self._url, - reason="No parsers support sitemap from {}".format(self._url), + reason=f"No parsers support sitemap from {self._url}", ) return self._concrete_parser.sitemap() @@ -270,7 +311,7 @@ def __normalize_xml_element_name(cls, name: str): name_parts = name.split(cls.__XML_NAMESPACE_SEPARATOR) if len(name_parts) == 1: - namespace_url = '' + namespace_url = "" name = name_parts[0] elif len(name_parts) == 2: @@ -278,12 +319,14 @@ def __normalize_xml_element_name(cls, name: str): name = name_parts[1] else: - raise SitemapXMLParsingException("Unable to determine namespace for element 
'{}'".format(name)) + raise SitemapXMLParsingException( + f"Unable to determine namespace for element '{name}'" + ) - if '/sitemap/' in namespace_url: - name = 'sitemap:{}'.format(name) - elif '/sitemap-news/' in namespace_url: - name = 'news:{}'.format(name) + if "/sitemap/" in namespace_url: + name = f"sitemap:{name}" + elif "/sitemap-news/" in namespace_url: + name = f"news:{name}" else: # We don't care about the rest of the namespaces, so just keep the plain element name pass @@ -291,75 +334,73 @@ def __normalize_xml_element_name(cls, name: str): return name def _xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: - name = self.__normalize_xml_element_name(name) if self._concrete_parser: self._concrete_parser.xml_element_start(name=name, attrs=attrs) else: - # Root element -- initialize concrete parser - if name == 'sitemap:urlset': + if name == "sitemap:urlset": self._concrete_parser = PagesXMLSitemapParser( url=self._url, ) - elif name == 'sitemap:sitemapindex': + elif name == "sitemap:sitemapindex": self._concrete_parser = IndexXMLSitemapParser( url=self._url, web_client=self._web_client, recursion_level=self._recursion_level, ) - elif name == 'rss': + elif name == "rss": self._concrete_parser = PagesRSSSitemapParser( url=self._url, ) - elif name == 'feed': + elif name == "feed": self._concrete_parser = PagesAtomSitemapParser( url=self._url, ) else: - raise SitemapXMLParsingException("Unsupported root element '{}'.".format(name)) + raise SitemapXMLParsingException(f"Unsupported root element '{name}'.") def _xml_element_end(self, name: str) -> None: - name = self.__normalize_xml_element_name(name) if not self._concrete_parser: - raise SitemapXMLParsingException("Concrete sitemap parser should be set by now.") + raise SitemapXMLParsingException( + "Concrete sitemap parser should be set by now." + ) self._concrete_parser.xml_element_end(name=name) def _xml_char_data(self, data: str) -> None: - if not self._concrete_parser: - raise SitemapXMLParsingException("Concrete sitemap parser should be set by now.") + raise SitemapXMLParsingException( + "Concrete sitemap parser should be set by now." + ) self._concrete_parser.xml_char_data(data=data) -class AbstractXMLSitemapParser(object, metaclass=abc.ABCMeta): +class AbstractXMLSitemapParser(metaclass=abc.ABCMeta): """ Abstract XML sitemap parser. 
""" __slots__ = [ # URL of the sitemap that is being parsed - '_url', - + "_url", # Last encountered character data - '_last_char_data', - - '_last_handler_call_was_xml_char_data', + "_last_char_data", + "_last_handler_call_was_xml_char_data", ] def __init__(self, url: str): self._url = url - self._last_char_data = '' + self._last_char_data = "" self._last_handler_call_was_xml_char_data = False def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: @@ -368,7 +409,7 @@ def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: def xml_element_end(self, name: str) -> None: # End of any element always resets last encountered character data - self._last_char_data = '' + self._last_char_data = "" self._last_handler_call_was_xml_char_data = False def xml_char_data(self, data: str) -> None: @@ -392,11 +433,10 @@ class IndexXMLSitemapParser(AbstractXMLSitemapParser): """ __slots__ = [ - '_web_client', - '_recursion_level', - + "_web_client", + "_recursion_level", # List of sub-sitemap URLs found in this index sitemap - '_sub_sitemap_urls', + "_sub_sitemap_urls", ] def __init__(self, url: str, web_client: AbstractWebClient, recursion_level: int): @@ -407,11 +447,12 @@ def __init__(self, url: str, web_client: AbstractWebClient, recursion_level: int self._sub_sitemap_urls = [] def xml_element_end(self, name: str) -> None: - - if name == 'sitemap:loc': + if name == "sitemap:loc": sub_sitemap_url = html_unescape_strip(self._last_char_data) if not is_http_url(sub_sitemap_url): - log.warning("Sub-sitemap URL does not look like one: {}".format(sub_sitemap_url)) + log.warning( + f"Sub-sitemap URL does not look like one: {sub_sitemap_url}" + ) else: if sub_sitemap_url not in self._sub_sitemap_urls: @@ -420,21 +461,21 @@ def xml_element_end(self, name: str) -> None: super().xml_element_end(name=name) def sitemap(self) -> AbstractSitemap: - sub_sitemaps = [] for sub_sitemap_url in self._sub_sitemap_urls: - # URL might be invalid, or recursion limit might have been reached try: - fetcher = SitemapFetcher(url=sub_sitemap_url, - recursion_level=self._recursion_level + 1, - web_client=self._web_client) + fetcher = SitemapFetcher( + url=sub_sitemap_url, + recursion_level=self._recursion_level + 1, + web_client=self._web_client, + ) fetched_sitemap = fetcher.sitemap() except Exception as ex: fetched_sitemap = InvalidSitemap( url=sub_sitemap_url, - reason="Unable to add sub-sitemap from URL {}: {}".format(sub_sitemap_url, str(ex)), + reason=f"Unable to add sub-sitemap from URL {sub_sitemap_url}: {str(ex)}", ) sub_sitemaps.append(fetched_sitemap) @@ -449,22 +490,22 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser): Pages XML sitemap parser. 
""" - class Page(object): + class Page: """Simple data class for holding various properties for a single entry while parsing.""" __slots__ = [ - 'url', - 'last_modified', - 'change_frequency', - 'priority', - 'news_title', - 'news_publish_date', - 'news_publication_name', - 'news_publication_language', - 'news_access', - 'news_genres', - 'news_keywords', - 'news_stock_tickers', + "url", + "last_modified", + "change_frequency", + "priority", + "news_title", + "news_publish_date", + "news_publication_name", + "news_publication_language", + "news_access", + "news_genres", + "news_keywords", + "news_stock_tickers", ] def __init__(self): @@ -482,10 +523,12 @@ def __init__(self): self.news_stock_tickers = None def __hash__(self): - return hash(( - # Hash only the URL to be able to find unique ones - self.url, - )) + return hash( + ( + # Hash only the URL to be able to find unique ones + self.url, + ) + ) def page(self) -> Optional[SitemapPage]: """Return constructed sitemap page if one has been completed, otherwise None.""" @@ -506,7 +549,9 @@ def page(self) -> Optional[SitemapPage]: if SitemapPageChangeFrequency.has_value(change_frequency): change_frequency = SitemapPageChangeFrequency(change_frequency) else: - log.warning("Invalid change frequency, defaulting to 'always'.".format(change_frequency)) + log.warning( + "Invalid change frequency, defaulting to 'always'.".format() + ) change_frequency = SitemapPageChangeFrequency.ALWAYS assert isinstance(change_frequency, SitemapPageChangeFrequency) @@ -514,13 +559,16 @@ def page(self) -> Optional[SitemapPage]: if priority: priority = Decimal(priority) - comp_zero = priority.compare(Decimal('0.0')) - comp_one = priority.compare(Decimal('1.0')) - if comp_zero in (Decimal('0'), Decimal('1') and comp_one in (Decimal('0'), Decimal('-1'))): + comp_zero = priority.compare(Decimal("0.0")) + comp_one = priority.compare(Decimal("1.0")) + if comp_zero in ( + Decimal("0"), + Decimal("1") and comp_one in (Decimal("0"), Decimal("-1")), + ): # 0 <= priority <= 1 pass else: - log.warning("Priority is not within 0 and 1: {}".format(priority)) + log.warning(f"Priority is not within 0 and 1: {priority}") priority = SITEMAP_PAGE_DEFAULT_PRIORITY else: @@ -533,24 +581,26 @@ def page(self) -> Optional[SitemapPage]: news_publish_date = parse_iso8601_date(date_string=news_publish_date) news_publication_name = html_unescape_strip(self.news_publication_name) - news_publication_language = html_unescape_strip(self.news_publication_language) + news_publication_language = html_unescape_strip( + self.news_publication_language + ) news_access = html_unescape_strip(self.news_access) news_genres = html_unescape_strip(self.news_genres) if news_genres: - news_genres = [x.strip() for x in news_genres.split(',')] + news_genres = [x.strip() for x in news_genres.split(",")] else: news_genres = [] news_keywords = html_unescape_strip(self.news_keywords) if news_keywords: - news_keywords = [x.strip() for x in news_keywords.split(',')] + news_keywords = [x.strip() for x in news_keywords.split(",")] else: news_keywords = [] news_stock_tickers = html_unescape_strip(self.news_stock_tickers) if news_stock_tickers: - news_stock_tickers = [x.strip() for x in news_stock_tickers.split(',')] + news_stock_tickers = [x.strip() for x in news_stock_tickers.split(",")] else: news_stock_tickers = [] @@ -576,8 +626,8 @@ def page(self) -> Optional[SitemapPage]: ) __slots__ = [ - '_current_page', - '_pages', + "_current_page", + "_pages", ] def __init__(self, url: str): @@ -587,82 +637,82 @@ def 
__init__(self, url: str): self._pages = [] def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: - super().xml_element_start(name=name, attrs=attrs) - if name == 'sitemap:url': + if name == "sitemap:url": if self._current_page: - raise SitemapXMLParsingException("Page is expected to be unset by .") + raise SitemapXMLParsingException( + "Page is expected to be unset by ." + ) self._current_page = self.Page() def __require_last_char_data_to_be_set(self, name: str) -> None: if not self._last_char_data: raise SitemapXMLParsingException( - "Character data is expected to be set at the end of <{}>.".format(name) + f"Character data is expected to be set at the end of <{name}>." ) def xml_element_end(self, name: str) -> None: + if not self._current_page and name != "sitemap:urlset": + raise SitemapXMLParsingException( + f"Page is expected to be set at the end of <{name}>." + ) - if not self._current_page and name != 'sitemap:urlset': - raise SitemapXMLParsingException("Page is expected to be set at the end of <{}>.".format(name)) - - if name == 'sitemap:url': + if name == "sitemap:url": if self._current_page not in self._pages: self._pages.append(self._current_page) self._current_page = None else: - - if name == 'sitemap:loc': + if name == "sitemap:loc": # Every entry must have self.__require_last_char_data_to_be_set(name=name) self._current_page.url = self._last_char_data - elif name == 'sitemap:lastmod': + elif name == "sitemap:lastmod": # Element might be present but character data might be empty self._current_page.last_modified = self._last_char_data - elif name == 'sitemap:changefreq': + elif name == "sitemap:changefreq": # Element might be present but character data might be empty self._current_page.change_frequency = self._last_char_data - elif name == 'sitemap:priority': + elif name == "sitemap:priority": # Element might be present but character data might be empty self._current_page.priority = self._last_char_data - elif name == 'news:name': # news/publication/name + elif name == "news:name": # news/publication/name # Element might be present but character data might be empty self._current_page.news_publication_name = self._last_char_data - elif name == 'news:language': # news/publication/language + elif name == "news:language": # news/publication/language # Element might be present but character data might be empty self._current_page.news_publication_language = self._last_char_data - elif name == 'news:publication_date': + elif name == "news:publication_date": # Element might be present but character data might be empty self._current_page.news_publish_date = self._last_char_data - elif name == 'news:title': + elif name == "news:title": # Every Google News sitemap entry must have self.__require_last_char_data_to_be_set(name=name) self._current_page.news_title = self._last_char_data - elif name == 'news:access': + elif name == "news:access": # Element might be present but character data might be empty self._current_page.news_access = self._last_char_data - elif name == 'news:keywords': + elif name == "news:keywords": # Element might be present but character data might be empty self._current_page.news_keywords = self._last_char_data - elif name == 'news:stock_tickers': + elif name == "news:stock_tickers": # Element might be present but character data might be empty self._current_page.news_stock_tickers = self._last_char_data super().xml_element_end(name=name) def sitemap(self) -> AbstractSitemap: - pages = [] for page_row in self._pages: @@ -682,16 +732,16 @@ class 
PagesRSSSitemapParser(AbstractXMLSitemapParser): https://validator.w3.org/feed/docs/rss2.html """ - class Page(object): + class Page: """ Data class for holding various properties for a single RSS <item> while parsing. """ __slots__ = [ - 'link', - 'title', - 'description', - 'publication_date', + "link", + "title", + "description", + "publication_date", ] def __init__(self): @@ -701,10 +751,12 @@ def __init__(self): self.publication_date = None def __hash__(self): - return hash(( - # Hash only the URL - self.link, - )) + return hash( + ( + # Hash only the URL + self.link, + ) + ) def page(self) -> Optional[SitemapPage]: """Return constructed sitemap page if one has been completed, otherwise None.""" @@ -734,8 +786,8 @@ def page(self) -> Optional[SitemapPage]: ) __slots__ = [ - '_current_page', - '_pages', + "_current_page", + "_pages", ] def __init__(self, url: str): @@ -745,55 +797,52 @@ def __init__(self, url: str): self._pages = [] def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: - super().xml_element_start(name=name, attrs=attrs) - if name == 'item': + if name == "item": if self._current_page: - raise SitemapXMLParsingException("Page is expected to be unset by <item>.") + raise SitemapXMLParsingException( + "Page is expected to be unset by <item>." + ) self._current_page = self.Page() def __require_last_char_data_to_be_set(self, name: str) -> None: if not self._last_char_data: raise SitemapXMLParsingException( - "Character data is expected to be set at the end of <{}>.".format(name) + f"Character data is expected to be set at the end of <{name}>." ) def xml_element_end(self, name: str) -> None: - # If within <item> already if self._current_page: - - if name == 'item': + if name == "item": if self._current_page not in self._pages: self._pages.append(self._current_page) self._current_page = None else: - - if name == 'link': + if name == "link": # Every entry must have <link> self.__require_last_char_data_to_be_set(name=name) self._current_page.link = self._last_char_data - elif name == 'title': + elif name == "title": # Title (if set) can't be empty self.__require_last_char_data_to_be_set(name=name) self._current_page.title = self._last_char_data - elif name == 'description': + elif name == "description": # Description (if set) can't be empty self.__require_last_char_data_to_be_set(name=name) self._current_page.description = self._last_char_data - elif name == 'pubDate': + elif name == "pubDate": # Element might be present but character data might be empty self._current_page.publication_date = self._last_char_data super().xml_element_end(name=name) def sitemap(self) -> AbstractSitemap: - pages = [] for page_row in self._pages: @@ -817,14 +866,14 @@ class PagesAtomSitemapParser(AbstractXMLSitemapParser): # FIXME merge with RSS parser class as there are too many similarities - class Page(object): + class Page: """Data class for holding various properties for a single Atom <entry> while parsing.""" __slots__ = [ - 'link', - 'title', - 'description', - 'publication_date', + "link", + "title", + "description", + "publication_date", ] def __init__(self): @@ -834,10 +883,12 @@ def __init__(self): self.publication_date = None def __hash__(self): - return hash(( - # Hash only the URL - self.link, - )) + return hash( + ( + # Hash only the URL + self.link, + ) + ) def page(self) -> Optional[SitemapPage]: """Return constructed sitemap page if one has been completed, otherwise None.""" @@ -867,9 +918,9 @@ def page(self) -> Optional[SitemapPage]: ) __slots__ = [ - 
'_current_page', - '_pages', - '_last_link_rel_self_href', + "_current_page", + "_pages", + "_last_link_rel_self_href", ] def __init__(self, url: str): @@ -880,32 +931,33 @@ def __init__(self, url: str): self._last_link_rel_self_href = None def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: - super().xml_element_start(name=name, attrs=attrs) - if name == 'entry': + if name == "entry": if self._current_page: - raise SitemapXMLParsingException("Page is expected to be unset by <entry>.") + raise SitemapXMLParsingException( + "Page is expected to be unset by <entry>." + ) self._current_page = self.Page() - elif name == 'link': + elif name == "link": if self._current_page: - if attrs.get('rel', 'self').lower() == 'self' or self._last_link_rel_self_href is None: - self._last_link_rel_self_href = attrs.get('href', None) + if ( + attrs.get("rel", "self").lower() == "self" + or self._last_link_rel_self_href is None + ): + self._last_link_rel_self_href = attrs.get("href", None) def __require_last_char_data_to_be_set(self, name: str) -> None: if not self._last_char_data: raise SitemapXMLParsingException( - "Character data is expected to be set at the end of <{}>.".format(name) + f"Character data is expected to be set at the end of <{name}>." ) def xml_element_end(self, name: str) -> None: - # If within <entry> already if self._current_page: - - if name == 'entry': - + if name == "entry": if self._last_link_rel_self_href: self._current_page.link = self._last_link_rel_self_href self._last_link_rel_self_href = None @@ -916,22 +968,21 @@ def xml_element_end(self, name: str) -> None: self._current_page = None else: - - if name == 'title': + if name == "title": # Title (if set) can't be empty self.__require_last_char_data_to_be_set(name=name) self._current_page.title = self._last_char_data - elif name == 'tagline' or name == 'summary': + elif name == "tagline" or name == "summary": # Description (if set) can't be empty self.__require_last_char_data_to_be_set(name=name) self._current_page.description = self._last_char_data - elif name == 'issued' or name == 'published': + elif name == "issued" or name == "published": # Element might be present but character data might be empty self._current_page.publication_date = self._last_char_data - elif name == 'updated': + elif name == "updated": # No 'issued' or 'published' were set before if not self._current_page.publication_date: self._current_page.publication_date = self._last_char_data @@ -939,7 +990,6 @@ def xml_element_end(self, name: str) -> None: super().xml_element_end(name=name) def sitemap(self) -> AbstractSitemap: - pages = [] for page_row in self._pages: diff --git a/usp/helpers.py b/usp/helpers.py index daef862..4037faa 100644 --- a/usp/helpers.py +++ b/usp/helpers.py @@ -21,7 +21,7 @@ log = create_logger(__name__) -__URL_REGEX = re.compile(r'^https?://[^\s/$.?#].[^\s]*$', re.IGNORECASE) +__URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE) """Regular expression to match HTTP(s) URLs.""" @@ -39,10 +39,10 @@ def is_http_url(url: str) -> bool: log.debug("URL is empty") return False - log.debug("Testing if URL '{}' is HTTP(s) URL".format(url)) + log.debug(f"Testing if URL '{url}' is HTTP(s) URL") if not re.search(__URL_REGEX, url): - log.debug("URL '{}' does not match URL's regexp".format(url)) + log.debug(f"URL '{url}' does not match URL's regexp") return False try: @@ -51,17 +51,17 @@ def is_http_url(url: str) -> bool: _ = urlunparse(uri) except Exception as ex: - log.debug("Cannot parse URL {}: {}".format(url, 
ex)) + log.debug(f"Cannot parse URL {url}: {ex}") return False if not uri.scheme: - log.debug("Scheme is undefined for URL {}.".format(url)) + log.debug(f"Scheme is undefined for URL {url}.") return False - if not uri.scheme.lower() in ['http', 'https']: - log.debug("Scheme is not HTTP(s) for URL {}.".format(url)) + if uri.scheme.lower() not in ["http", "https"]: + log.debug(f"Scheme is not HTTP(s) for URL {url}.") return False if not uri.hostname: - log.debug("Host is undefined for URL {}.".format(url)) + log.debug(f"Host is undefined for URL {url}.") return False return True @@ -110,10 +110,12 @@ def parse_rfc2822_date(date_string: str) -> datetime.datetime: return parse_iso8601_date(date_string) -def get_url_retry_on_client_errors(url: str, - web_client: AbstractWebClient, - retry_count: int = 5, - sleep_between_retries: int = 1) -> AbstractWebClientResponse: +def get_url_retry_on_client_errors( + url: str, + web_client: AbstractWebClient, + retry_count: int = 5, + sleep_between_retries: int = 1, +) -> AbstractWebClientResponse: """ Fetch URL, retry on retryable errors. @@ -127,32 +129,30 @@ def get_url_retry_on_client_errors(url: str, response = None for retry in range(0, retry_count): - log.info("Fetching URL {}...".format(url)) + log.info(f"Fetching URL {url}...") response = web_client.get(url) if isinstance(response, WebClientErrorResponse): - log.warning( - "Request for URL {} failed: {}".format( - url, response.message() - ) - ) + log.warning(f"Request for URL {url} failed: {response.message()}") if response.retryable(): - log.info("Retrying URL {} in {} seconds...".format(url, sleep_between_retries)) + log.info(f"Retrying URL {url} in {sleep_between_retries} seconds...") time.sleep(sleep_between_retries) else: - log.info("Not retrying for URL {}".format(url)) + log.info(f"Not retrying for URL {url}") return response else: return response - log.info("Giving up on URL {}".format(url)) + log.info(f"Giving up on URL {url}") return response -def __response_is_gzipped_data(url: str, response: AbstractWebClientSuccessResponse) -> bool: +def __response_is_gzipped_data( + url: str, response: AbstractWebClientSuccessResponse +) -> bool: """ Return True if Response looks like it's gzipped. @@ -162,9 +162,9 @@ def __response_is_gzipped_data(url: str, response: AbstractWebClientSuccessRespo """ uri = urlparse(url) url_path = unquote_plus(uri.path) - content_type = response.header('content-type') or '' + content_type = response.header("content-type") or "" - if url_path.lower().endswith('.gz') or 'gzip' in content_type.lower(): + if url_path.lower().endswith(".gz") or "gzip" in content_type.lower(): return True else: @@ -183,15 +183,17 @@ def gunzip(data: bytes) -> bytes: raise GunzipException("Data is None.") if not isinstance(data, bytes): - raise GunzipException("Data is not bytes: %s" % str(data)) + raise GunzipException(f"Data is not bytes: {str(data)}") if len(data) == 0: - raise GunzipException("Data is empty (no way an empty string is a valid Gzip archive).") + raise GunzipException( + "Data is empty (no way an empty string is a valid Gzip archive)." 
+ ) try: gunzipped_data = gzip_lib.decompress(data) except Exception as ex: - raise GunzipException("Unable to gunzip data: %s" % str(ex)) + raise GunzipException(f"Unable to gunzip data: {str(ex)}") if gunzipped_data is None: raise GunzipException("Gunzipped data is None.") @@ -202,7 +204,9 @@ def gunzip(data: bytes) -> bytes: return gunzipped_data -def ungzipped_response_content(url: str, response: AbstractWebClientSuccessResponse) -> str: +def ungzipped_response_content( + url: str, response: AbstractWebClientSuccessResponse +) -> str: """ Return HTTP response's decoded content, gunzip it if necessary. @@ -218,10 +222,12 @@ def ungzipped_response_content(url: str, response: AbstractWebClientSuccessRespo data = gunzip(data) except GunzipException as ex: # In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension - log.error("Unable to gunzip response {}, maybe it's a non-gzipped sitemap: {}".format(response, ex)) + log.error( + f"Unable to gunzip response {response}, maybe it's a non-gzipped sitemap: {ex}" + ) # FIXME other encodings - data = data.decode('utf-8-sig', errors='replace') + data = data.decode("utf-8-sig", errors="replace") assert isinstance(data, str) @@ -241,17 +247,20 @@ def strip_url_to_homepage(url: str) -> str: try: uri = urlparse(url) assert uri.scheme, "Scheme must be set." - assert uri.scheme.lower() in ['http', 'https'], "Scheme must be http:// or https://" + assert uri.scheme.lower() in [ + "http", + "https", + ], "Scheme must be http:// or https://" uri = ( uri.scheme, uri.netloc, - '/', # path - '', # params - '', # query - '', # fragment + "/", # path + "", # params + "", # query + "", # fragment ) url = urlunparse(uri) except Exception as ex: - raise StripURLToHomepageException("Unable to parse URL {}: {}".format(url, ex)) + raise StripURLToHomepageException(f"Unable to parse URL {url}: {ex}") return url diff --git a/usp/log.py b/usp/log.py index 2f2eae8..f2ca0a8 100644 --- a/usp/log.py +++ b/usp/log.py @@ -3,26 +3,26 @@ import logging -class Logger(object): +class Logger: """ Logging helper class. """ __LEVELS = { - 'CRITICAL': logging.CRITICAL, - 'ERROR': logging.ERROR, - 'WARNING': logging.WARNING, - 'INFO': logging.INFO, - 'DEBUG': logging.DEBUG, + "CRITICAL": logging.CRITICAL, + "ERROR": logging.ERROR, + "WARNING": logging.WARNING, + "INFO": logging.INFO, + "DEBUG": logging.DEBUG, } """Valid logging levels and their "logging" counterparts.""" - __DEFAULT_LEVEL = 'INFO' + __DEFAULT_LEVEL = "INFO" """Default logging level.""" __slots__ = [ # "logging" object - '__l', + "__l", ] def __init__(self, name: str): diff --git a/usp/objects/page.py b/usp/objects/page.py index 7a9e1f7..e1451cc 100644 --- a/usp/objects/page.py +++ b/usp/objects/page.py @@ -5,35 +5,37 @@ from enum import Enum, unique from typing import List, Optional -SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal('0.5') +SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5") """Default sitemap page priority, as per the spec.""" -class SitemapNewsStory(object): +class SitemapNewsStory: """ Single story derived from Google News XML sitemap. 
""" __slots__ = [ - '__title', - '__publish_date', - '__publication_name', - '__publication_language', - '__access', - '__genres', - '__keywords', - '__stock_tickers', + "__title", + "__publish_date", + "__publication_name", + "__publication_language", + "__access", + "__genres", + "__keywords", + "__stock_tickers", ] - def __init__(self, - title: str, - publish_date: datetime.datetime, - publication_name: Optional[str] = None, - publication_language: Optional[str] = None, - access: Optional[str] = None, - genres: List[str] = None, - keywords: List[str] = None, - stock_tickers: List[str] = None): + def __init__( + self, + title: str, + publish_date: datetime.datetime, + publication_name: Optional[str] = None, + publication_language: Optional[str] = None, + access: Optional[str] = None, + genres: List[str] = None, + keywords: List[str] = None, + stock_tickers: List[str] = None, + ): """ Initialize a new Google News story. @@ -61,7 +63,7 @@ def __init__(self, def __eq__(self, other) -> bool: if not isinstance(other, SitemapNewsStory): - raise NotImplemented + raise NotImplementedError if self.title != other.title: return False @@ -90,30 +92,32 @@ def __eq__(self, other) -> bool: return True def __hash__(self): - return hash(( - self.title, - self.publish_date, - self.publication_name, - self.publication_language, - self.access, - self.genres, - self.keywords, - self.stock_tickers, - )) + return hash( + ( + self.title, + self.publish_date, + self.publication_name, + self.publication_language, + self.access, + self.genres, + self.keywords, + self.stock_tickers, + ) + ) def __repr__(self): return ( - "{self.__class__.__name__}(" - "title={self.title}, " - "publish_date={self.publish_date}, " - "publication_name={self.publication_name}, " - "publication_language={self.publication_language}, " - "access={self.access}, " - "genres={self.genres}, " - "keywords={self.keywords}, " - "stock_tickers={self.stock_tickers}" + f"{self.__class__.__name__}(" + f"title={self.title}, " + f"publish_date={self.publish_date}, " + f"publication_name={self.publication_name}, " + f"publication_language={self.publication_language}, " + f"access={self.access}, " + f"genres={self.genres}, " + f"keywords={self.keywords}, " + f"stock_tickers={self.stock_tickers}" ")" - ).format(self=self) + ) @property def title(self) -> str: @@ -198,13 +202,13 @@ def stock_tickers(self) -> List[str]: class SitemapPageChangeFrequency(Enum): """Change frequency of a sitemap URL.""" - ALWAYS = 'always' - HOURLY = 'hourly' - DAILY = 'daily' - WEEKLY = 'weekly' - MONTHLY = 'monthly' - YEARLY = 'yearly' - NEVER = 'never' + ALWAYS = "always" + HOURLY = "hourly" + DAILY = "daily" + WEEKLY = "weekly" + MONTHLY = "monthly" + YEARLY = "yearly" + NEVER = "never" @classmethod def has_value(cls, value: str) -> bool: @@ -212,23 +216,25 @@ def has_value(cls, value: str) -> bool: return any(value == item.value for item in cls) -class SitemapPage(object): +class SitemapPage: """Single sitemap-derived page.""" __slots__ = [ - '__url', - '__priority', - '__last_modified', - '__change_frequency', - '__news_story', + "__url", + "__priority", + "__last_modified", + "__change_frequency", + "__news_story", ] - def __init__(self, - url: str, - priority: Decimal = SITEMAP_PAGE_DEFAULT_PRIORITY, - last_modified: Optional[datetime.datetime] = None, - change_frequency: Optional[SitemapPageChangeFrequency] = None, - news_story: Optional[SitemapNewsStory] = None): + def __init__( + self, + url: str, + priority: Decimal = SITEMAP_PAGE_DEFAULT_PRIORITY, + last_modified: 
Optional[datetime.datetime] = None, + change_frequency: Optional[SitemapPageChangeFrequency] = None, + news_story: Optional[SitemapNewsStory] = None, + ): """ Initialize a new sitemap-derived page. @@ -246,7 +252,7 @@ def __init__(self, def __eq__(self, other) -> bool: if not isinstance(other, SitemapPage): - raise NotImplemented + raise NotImplementedError if self.url != other.url: return False @@ -266,21 +272,23 @@ def __eq__(self, other) -> bool: return True def __hash__(self): - return hash(( - # Hash only the URL to be able to find unique pages later on - self.url, - )) + return hash( + ( + # Hash only the URL to be able to find unique pages later on + self.url, + ) + ) def __repr__(self): return ( - "{self.__class__.__name__}(" - "url={self.url}, " - "priority={self.priority}, " - "last_modified={self.last_modified}, " - "change_frequency={self.change_frequency}, " - "news_story={self.news_story}" + f"{self.__class__.__name__}(" + f"url={self.url}, " + f"priority={self.priority}, " + f"last_modified={self.last_modified}, " + f"change_frequency={self.change_frequency}, " + f"news_story={self.news_story}" ")" - ).format(self=self) + ) @property def url(self) -> str: diff --git a/usp/objects/sitemap.py b/usp/objects/sitemap.py index 12018c5..612b537 100644 --- a/usp/objects/sitemap.py +++ b/usp/objects/sitemap.py @@ -9,13 +9,13 @@ from .page import SitemapPage -class AbstractSitemap(object, metaclass=abc.ABCMeta): +class AbstractSitemap(metaclass=abc.ABCMeta): """ Abstract sitemap. """ __slots__ = [ - '__url', + "__url", ] def __init__(self, url: str): @@ -28,7 +28,7 @@ def __init__(self, url: str): def __eq__(self, other) -> bool: if not isinstance(other, AbstractSitemap): - raise NotImplemented + raise NotImplementedError if self.url != other.url: return False @@ -36,16 +36,10 @@ def __eq__(self, other) -> bool: return True def __hash__(self): - return hash(( - self.url, - )) + return hash((self.url,)) def __repr__(self): - return ( - "{self.__class__.__name__}(" - "url={self.url}" - ")" - ).format(self=self) + return f"{self.__class__.__name__}(" f"url={self.url}" ")" @property def url(self) -> str: @@ -70,7 +64,7 @@ class InvalidSitemap(AbstractSitemap): """Invalid sitemap, e.g. 
the one that can't be parsed.""" __slots__ = [ - '__reason', + "__reason", ] def __init__(self, url: str, reason: str): @@ -85,7 +79,7 @@ def __init__(self, url: str, reason: str): def __eq__(self, other) -> bool: if not isinstance(other, InvalidSitemap): - raise NotImplemented + raise NotImplementedError if self.url != other.url: return False @@ -97,11 +91,11 @@ def __eq__(self, other) -> bool: def __repr__(self): return ( - "{self.__class__.__name__}(" - "url={self.url}, " - "reason={self.reason}" + f"{self.__class__.__name__}(" + f"url={self.url}, " + f"reason={self.reason}" ")" - ).format(self=self) + ) @property def reason(self) -> str: @@ -125,7 +119,7 @@ class AbstractPagesSitemap(AbstractSitemap, metaclass=abc.ABCMeta): """Abstract sitemap that contains URLs to pages.""" __slots__ = [ - '__pages_temp_file_path', + "__pages_temp_file_path", ] def __init__(self, url: str, pages: List[SitemapPage]): @@ -138,7 +132,7 @@ def __init__(self, url: str, pages: List[SitemapPage]): super().__init__(url=url) temp_file, self.__pages_temp_file_path = tempfile.mkstemp() - with os.fdopen(temp_file, 'wb') as tmp: + with os.fdopen(temp_file, "wb") as tmp: pickle.dump(pages, tmp, protocol=pickle.HIGHEST_PROTOCOL) def __del__(self): @@ -146,7 +140,7 @@ def __del__(self): def __eq__(self, other) -> bool: if not isinstance(other, AbstractPagesSitemap): - raise NotImplemented + raise NotImplementedError if self.url != other.url: return False @@ -158,11 +152,8 @@ def __eq__(self, other) -> bool: def __repr__(self): return ( - "{self.__class__.__name__}(" - "url={self.url}, " - "pages={self.pages}" - ")" - ).format(self=self) + f"{self.__class__.__name__}(" f"url={self.url}, " f"pages={self.pages}" ")" + ) @property def pages(self) -> List[SitemapPage]: @@ -171,7 +162,7 @@ def pages(self) -> List[SitemapPage]: :return: List of pages found in a sitemap. """ - with open(self.__pages_temp_file_path, 'rb') as tmp: + with open(self.__pages_temp_file_path, "rb") as tmp: pages = pickle.load(tmp) return pages @@ -181,14 +172,14 @@ def all_pages(self) -> Iterator[SitemapPage]: :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any). """ - for page in self.pages: - yield page + yield from self.pages class PagesXMLSitemap(AbstractPagesSitemap): """ XML sitemap that contains URLs to pages. """ + pass @@ -196,6 +187,7 @@ class PagesTextSitemap(AbstractPagesSitemap): """ Plain text sitemap that contains URLs to pages. """ + pass @@ -203,6 +195,7 @@ class PagesRSSSitemap(AbstractPagesSitemap): """ RSS 2.0 sitemap that contains URLs to pages. """ + pass @@ -210,6 +203,7 @@ class PagesAtomSitemap(AbstractPagesSitemap): """ RSS 0.3 / 1.0 sitemap that contains URLs to pages. 
""" + pass @@ -219,7 +213,7 @@ class AbstractIndexSitemap(AbstractSitemap): """ __slots__ = [ - '__sub_sitemaps', + "__sub_sitemaps", ] def __init__(self, url: str, sub_sitemaps: List[AbstractSitemap]): @@ -234,7 +228,7 @@ def __init__(self, url: str, sub_sitemaps: List[AbstractSitemap]): def __eq__(self, other) -> bool: if not isinstance(other, AbstractIndexSitemap): - raise NotImplemented + raise NotImplementedError if self.url != other.url: return False @@ -246,11 +240,11 @@ def __eq__(self, other) -> bool: def __repr__(self): return ( - "{self.__class__.__name__}(" - "url={self.url}, " - "sub_sitemaps={self.sub_sitemaps}" + f"{self.__class__.__name__}(" + f"url={self.url}, " + f"sub_sitemaps={self.sub_sitemaps}" ")" - ).format(self=self) + ) @property def sub_sitemaps(self) -> List[AbstractSitemap]: @@ -268,14 +262,14 @@ def all_pages(self) -> Iterator[SitemapPage]: :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any). """ for sub_sitemap in self.sub_sitemaps: - for page in sub_sitemap.all_pages(): - yield page + yield from self.all_pages() class IndexWebsiteSitemap(AbstractIndexSitemap): """ Website's root sitemaps, including robots.txt and extra ones. """ + pass @@ -283,6 +277,7 @@ class IndexXMLSitemap(AbstractIndexSitemap): """ XML sitemap with URLs to other sitemaps. """ + pass @@ -290,4 +285,5 @@ class IndexRobotsTxtSitemap(AbstractIndexSitemap): """ robots.txt sitemap with URLs to other sitemaps. """ + pass diff --git a/usp/tree.py b/usp/tree.py index 26431bb..5759355 100644 --- a/usp/tree.py +++ b/usp/tree.py @@ -6,31 +6,38 @@ from .fetch_parse import SitemapFetcher from .helpers import is_http_url, strip_url_to_homepage from .log import create_logger -from .objects.sitemap import AbstractSitemap, InvalidSitemap, IndexWebsiteSitemap, IndexRobotsTxtSitemap +from .objects.sitemap import ( + AbstractSitemap, + InvalidSitemap, + IndexWebsiteSitemap, + IndexRobotsTxtSitemap, +) from .web_client.abstract_client import AbstractWebClient log = create_logger(__name__) _UNPUBLISHED_SITEMAP_PATHS = { - 'sitemap.xml', - 'sitemap.xml.gz', - 'sitemap_index.xml', - 'sitemap-index.xml', - 'sitemap_index.xml.gz', - 'sitemap-index.xml.gz', - '.sitemap.xml', - 'sitemap', - 'admin/config/search/xmlsitemap', - 'sitemap/sitemap-index.xml', - 'sitemap_news.xml', - 'sitemap-news.xml', - 'sitemap_news.xml.gz', - 'sitemap-news.xml.gz', + "sitemap.xml", + "sitemap.xml.gz", + "sitemap_index.xml", + "sitemap-index.xml", + "sitemap_index.xml.gz", + "sitemap-index.xml.gz", + ".sitemap.xml", + "sitemap", + "admin/config/search/xmlsitemap", + "sitemap/sitemap-index.xml", + "sitemap_news.xml", + "sitemap-news.xml", + "sitemap_news.xml.gz", + "sitemap-news.xml.gz", } """Paths which are not exposed in robots.txt but might still contain a sitemap.""" -def sitemap_tree_for_homepage(homepage_url: str, web_client: Optional[AbstractWebClient] = None) -> AbstractSitemap: +def sitemap_tree_for_homepage( + homepage_url: str, web_client: Optional[AbstractWebClient] = None +) -> AbstractSitemap: """ Using a homepage URL, fetch the tree of sitemaps and pages listed in them. 
@@ -40,20 +47,24 @@ def sitemap_tree_for_homepage(homepage_url: str, web_client: Optional[AbstractWe """ if not is_http_url(homepage_url): - raise SitemapException("URL {} is not a HTTP(s) URL.".format(homepage_url)) + raise SitemapException(f"URL {homepage_url} is not a HTTP(s) URL.") stripped_homepage_url = strip_url_to_homepage(url=homepage_url) if homepage_url != stripped_homepage_url: - log.warning("Assuming that the homepage of {} is {}".format(homepage_url, stripped_homepage_url)) + log.warning( + f"Assuming that the homepage of {homepage_url} is {stripped_homepage_url}" + ) homepage_url = stripped_homepage_url - if not homepage_url.endswith('/'): - homepage_url += '/' - robots_txt_url = homepage_url + 'robots.txt' + if not homepage_url.endswith("/"): + homepage_url += "/" + robots_txt_url = homepage_url + "robots.txt" sitemaps = [] - robots_txt_fetcher = SitemapFetcher(url=robots_txt_url, web_client=web_client, recursion_level=0) + robots_txt_fetcher = SitemapFetcher( + url=robots_txt_url, web_client=web_client, recursion_level=0 + ) robots_txt_sitemap = robots_txt_fetcher.sitemap() if not isinstance(robots_txt_sitemap, InvalidSitemap): sitemaps.append(robots_txt_sitemap) @@ -68,7 +79,6 @@ def sitemap_tree_for_homepage(homepage_url: str, web_client: Optional[AbstractWe # Don't refetch URLs already found in robots.txt if unpublished_sitemap_url not in sitemap_urls_found_in_robots_txt: - unpublished_sitemap_fetcher = SitemapFetcher( url=unpublished_sitemap_url, web_client=web_client, diff --git a/usp/web_client/abstract_client.py b/usp/web_client/abstract_client.py index bcb06ef..54299c1 100644 --- a/usp/web_client/abstract_client.py +++ b/usp/web_client/abstract_client.py @@ -5,76 +5,59 @@ from typing import Optional RETRYABLE_HTTP_STATUS_CODES = { - # Some servers return "400 Bad Request" initially but upon retry start working again, no idea why int(HTTPStatus.BAD_REQUEST), - # If we timed out requesting stuff, we can just try again int(HTTPStatus.REQUEST_TIMEOUT), - # If we got rate limited, it makes sense to wait a bit int(HTTPStatus.TOO_MANY_REQUESTS), - # Server might be just fine on a subsequent attempt int(HTTPStatus.INTERNAL_SERVER_ERROR), - # Upstream might reappear on a retry int(HTTPStatus.BAD_GATEWAY), - # Service might become available again on a retry int(HTTPStatus.SERVICE_UNAVAILABLE), - # Upstream might reappear on a retry int(HTTPStatus.GATEWAY_TIMEOUT), - # (unofficial) 509 Bandwidth Limit Exceeded (Apache Web Server/cPanel) 509, - # (unofficial) 598 Network read timeout error 598, - # (unofficial, nginx) 499 Client Closed Request 499, - # (unofficial, Cloudflare) 520 Unknown Error 520, - # (unofficial, Cloudflare) 521 Web Server Is Down 521, - # (unofficial, Cloudflare) 522 Connection Timed Out 522, - # (unofficial, Cloudflare) 523 Origin Is Unreachable 523, - # (unofficial, Cloudflare) 524 A Timeout Occurred 524, - # (unofficial, Cloudflare) 525 SSL Handshake Failed 525, - # (unofficial, Cloudflare) 526 Invalid SSL Certificate 526, - # (unofficial, Cloudflare) 527 Railgun Error 527, - # (unofficial, Cloudflare) 530 Origin DNS Error 530, - } """HTTP status codes on which a request should be retried.""" -class AbstractWebClientResponse(object, metaclass=abc.ABCMeta): +class AbstractWebClientResponse(metaclass=abc.ABCMeta): """ Abstract response. """ + pass -class AbstractWebClientSuccessResponse(AbstractWebClientResponse, metaclass=abc.ABCMeta): +class AbstractWebClientSuccessResponse( + AbstractWebClientResponse, metaclass=abc.ABCMeta +): """ Successful response. 
""" @@ -123,8 +106,8 @@ class WebClientErrorResponse(AbstractWebClientResponse, metaclass=abc.ABCMeta): """ __slots__ = [ - '_message', - '_retryable', + "_message", + "_retryable", ] def __init__(self, message: str, retryable: bool): @@ -155,7 +138,7 @@ def retryable(self) -> bool: return self._retryable -class AbstractWebClient(object, metaclass=abc.ABCMeta): +class AbstractWebClient(metaclass=abc.ABCMeta): """ Abstract web client to be used by the sitemap fetcher. """ diff --git a/usp/web_client/requests_client.py b/usp/web_client/requests_client.py index 30d9078..35aa6d6 100644 --- a/usp/web_client/requests_client.py +++ b/usp/web_client/requests_client.py @@ -21,11 +21,15 @@ class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse): """ __slots__ = [ - '__requests_response', - '__max_response_data_length', + "__requests_response", + "__max_response_data_length", ] - def __init__(self, requests_response: requests.Response, max_response_data_length: Optional[int] = None): + def __init__( + self, + requests_response: requests.Response, + max_response_data_length: Optional[int] = None, + ): self.__requests_response = requests_response self.__max_response_data_length = max_response_data_length @@ -43,7 +47,7 @@ def header(self, case_insensitive_name: str) -> Optional[str]: def raw_data(self) -> bytes: if self.__max_response_data_length: - data = self.__requests_response.content[:self.__max_response_data_length] + data = self.__requests_response.content[: self.__max_response_data_length] else: data = self.__requests_response.content @@ -54,13 +58,14 @@ class RequestsWebClientErrorResponse(WebClientErrorResponse): """ requests-based error response. """ + pass class RequestsWebClient(AbstractWebClient): """requests-based web client to be used by the sitemap fetcher.""" - __USER_AGENT = 'ultimate_sitemap_parser/{}'.format(__version__) + __USER_AGENT = f"ultimate_sitemap_parser/{__version__}" __HTTP_REQUEST_TIMEOUT = 60 """ @@ -70,9 +75,9 @@ class RequestsWebClient(AbstractWebClient): """ __slots__ = [ - '__max_response_data_length', - '__timeout', - '__proxies', + "__max_response_data_length", + "__timeout", + "__proxies", ] def __init__(self, verify=True): @@ -92,7 +97,7 @@ def set_proxies(self, proxies: Dict[str, str]) -> None: * keys are schemes, e.g. "http" or "https"; * values are "scheme://user:password@host:port/". 
- + For example: proxies = {'http': 'http://user:pass@10.10.1.10:3128/'} @@ -109,7 +114,7 @@ def get(self, url: str) -> AbstractWebClientResponse: url, timeout=self.__timeout, stream=True, - headers={'User-Agent': self.__USER_AGENT}, + headers={"User-Agent": self.__USER_AGENT}, proxies=self.__proxies, verify=self.__verify, ) @@ -122,17 +127,19 @@ def get(self, url: str) -> AbstractWebClientResponse: return RequestsWebClientErrorResponse(message=str(ex), retryable=False) else: - if 200 <= response.status_code < 300: return RequestsWebClientSuccessResponse( requests_response=response, max_response_data_length=self.__max_response_data_length, ) else: - - message = '{} {}'.format(response.status_code, response.reason) + message = f"{response.status_code} {response.reason}" if response.status_code in RETRYABLE_HTTP_STATUS_CODES: - return RequestsWebClientErrorResponse(message=message, retryable=True) + return RequestsWebClientErrorResponse( + message=message, retryable=True + ) else: - return RequestsWebClientErrorResponse(message=message, retryable=False) + return RequestsWebClientErrorResponse( + message=message, retryable=False + ) From c19bc4d79d2e5b68d682eff817be30776480ff2f Mon Sep 17 00:00:00 2001 From: Freddy Heppell <freddy@freddyheppell.com> Date: Fri, 16 Aug 2024 21:47:54 +0100 Subject: [PATCH 26/79] Change to use requests_mock fixture --- poetry.lock | 2 +- pyproject.toml | 5 +- tests/conftest.py | 0 tests/test_tree.py | 2305 ++++++++++++++++++++-------------------- usp/objects/sitemap.py | 2 +- 5 files changed, 1134 insertions(+), 1180 deletions(-) create mode 100644 tests/conftest.py diff --git a/poetry.lock b/poetry.lock index 8bc3be3..70727a3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -326,4 +326,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "521cedf7d4b94b7856f3f291ec2b9a594e71aaaf21ab207cbd020f91c250d633" +content-hash = "d194c98c146afb5b1110c889dd9746762a7ed79baa7144b0ca9cab96d1995ca6" diff --git a/pyproject.toml b/pyproject.toml index ba8d508..d267a85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ requests = ">=2.2.1" [tool.poetry.group.dev.dependencies] requests-mock = ">=1.6.0,<2.0" -pytest = ">=2.8" +pytest = "^8.3.0" ruff = "^0.6.1" [build-system] @@ -44,5 +44,6 @@ select = [ "E7", "E9", "F", - "UP" + "UP", + "PT" ] \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_tree.py b/tests/test_tree.py index d6cc205..7168e04 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -3,9 +3,7 @@ import textwrap from decimal import Decimal from email.utils import format_datetime -from unittest import TestCase - -import requests_mock +import requests_mock as rq_mock from dateutil.tz import tzoffset from tests.helpers import gzip @@ -36,7 +34,7 @@ log = create_logger(__name__) -class TestSitemapTree(TestCase): +class TestSitemapTree: TEST_BASE_URL = "http://test_ultimate-sitemap-parser.com" # mocked by HTTPretty # Publication / "last modified" date @@ -61,7 +59,7 @@ class TestSitemapTree(TestCase): @staticmethod def fallback_to_404_not_found_matcher(request): """Reply with "404 Not Found" to unmatched URLs instead of throwing NoMockAddress.""" - return requests_mock.create_response( + return rq_mock.create_response( request, status_code=404, reason="Not Found", @@ -70,894 +68,875 @@ def fallback_to_404_not_found_matcher(request): ) # noinspection DuplicatedCode - def 
test_sitemap_tree_for_homepage(self): + def test_sitemap_tree_for_homepage(self, requests_mock): """Test sitemap_tree_for_homepage().""" - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) - m.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap_pages.xml - - # Intentionally spelled as "Site-map" as Google tolerates this: - # https://github.com/google/robotstxt/blob/master/robots.cc#L703 - Site-map: {self.TEST_BASE_URL}/sitemap_news_index_1.xml - """ - ).strip(), - ) - - # One sitemap for random static pages - m.get( - self.TEST_BASE_URL + "/sitemap_pages.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - <?xml version="1.0" encoding="UTF-8"?> - <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> - <url> - <loc>{self.TEST_BASE_URL}/about.html</loc> - <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> - <changefreq>monthly</changefreq> - <priority>0.8</priority> - </url> - <url> - <loc>{self.TEST_BASE_URL}/contact.html</loc> - <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> - - <!-- Invalid change frequency --> - <changefreq>when we feel like it</changefreq> - - <!-- Invalid priority --> - <priority>1.1</priority> - - </url> - </urlset> - """ - ).strip(), - ) - - # Index sitemap pointing to sitemaps with stories - m.get( - self.TEST_BASE_URL + "/sitemap_news_index_1.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - <?xml version="1.0" encoding="UTF-8"?> - <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> - <sitemap> - <loc>{self.TEST_BASE_URL}/sitemap_news_1.xml</loc> - <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> - </sitemap> - <sitemap> - <loc>{self.TEST_BASE_URL}/sitemap_news_index_2.xml</loc> - <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> - </sitemap> - </sitemapindex> - """ - ).strip(), - ) - - # First sitemap with actual stories - m.get( - self.TEST_BASE_URL + "/sitemap_news_1.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - <?xml version="1.0" encoding="UTF-8"?> - <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" - xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" - xmlns:xhtml="http://www.w3.org/1999/xhtml"> - - <url> - <loc>{self.TEST_BASE_URL}/news/foo.html</loc> - - <!-- Element present but empty --> - <lastmod /> - - <!-- Some other XML namespace --> - <xhtml:link rel="alternate" - media="only screen and (max-width: 640px)" - href="{self.TEST_BASE_URL}/news/foo.html?mobile=1" /> - - <news:news> - <news:publication> - <news:name>{self.TEST_PUBLICATION_NAME}</news:name> - <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> - </news:publication> - <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> - <news:title>Foo <foo></news:title> <!-- HTML entity decoding --> - </news:news> - </url> - - <!-- Has a duplicate story in /sitemap_news_2.xml --> - <url> - <loc>{self.TEST_BASE_URL}/news/bar.html</loc> - <xhtml:link rel="alternate" - media="only screen and (max-width: 640px)" - href="{self.TEST_BASE_URL}/news/bar.html?mobile=1" /> - <news:news> - <news:publication> - 
<news:name>{self.TEST_PUBLICATION_NAME}</news:name> - <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> - </news:publication> - <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> - <news:title>Bar & bar</news:title> - </news:news> - </url> - - </urlset> - """ - ).strip(), - ) - - # Another index sitemap pointing to a second sitemaps with stories - m.get( - self.TEST_BASE_URL + "/sitemap_news_index_2.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - <?xml version="1.0" encoding="UTF-8"?> - <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> - - <sitemap> - <!-- Extra whitespace added around URL --> - <loc> {self.TEST_BASE_URL}/sitemap_news_2.xml </loc> - <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> - </sitemap> - - <!-- Nonexistent sitemap --> - <sitemap> - <loc>{self.TEST_BASE_URL}/sitemap_news_missing.xml</loc> - <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> - </sitemap> - - </sitemapindex> - """ - ).strip(), - ) - - # Second sitemap with actual stories - m.get( - self.TEST_BASE_URL + "/sitemap_news_2.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - <?xml version="1.0" encoding="UTF-8"?> - <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" - xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" - xmlns:xhtml="http://www.w3.org/1999/xhtml"> - - <!-- Has a duplicate story in /sitemap_news_1.xml --> - <url> - <!-- Extra whitespace added around URL --> - <loc> {self.TEST_BASE_URL}/news/bar.html </loc> - <xhtml:link rel="alternate" - media="only screen and (max-width: 640px)" - href="{self.TEST_BASE_URL}/news/bar.html?mobile=1#fragment_is_to_be_removed" /> - <news:news> - <news:publication> - <news:name>{self.TEST_PUBLICATION_NAME}</news:name> - <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> - </news:publication> - <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> - - <tag_without_inner_character_data name="value" /> - - <news:title>Bar & bar</news:title> - </news:news> - </url> - - <url> - <loc>{self.TEST_BASE_URL}/news/baz.html</loc> - <xhtml:link rel="alternate" - media="only screen and (max-width: 640px)" - href="{self.TEST_BASE_URL}/news/baz.html?mobile=1" /> - <news:news> - <news:publication> - <news:name>{self.TEST_PUBLICATION_NAME}</news:name> - <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> - </news:publication> - <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> - <news:title><![CDATA[Bąž]]></news:title> <!-- CDATA and UTF-8 --> - </news:news> - </url> - - </urlset> - """ - ).strip(), - ) - - # Nonexistent sitemap - m.get( - self.TEST_BASE_URL + "/sitemap_news_missing.xml", - status_code=404, - reason="Not Found", - headers={"Content-Type": "text/html"}, - text="<h1>404 Not Found!</h1>", - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[ - IndexRobotsTxtSitemap( - url=f"{self.TEST_BASE_URL}/robots.txt", - sub_sitemaps=[ - PagesXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_pages.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/about.html", - last_modified=self.TEST_DATE_DATETIME, - news_story=None, - change_frequency=SitemapPageChangeFrequency.MONTHLY, - priority=Decimal("0.8"), - ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/contact.html", - last_modified=self.TEST_DATE_DATETIME, - news_story=None, - # Invalid input -- should be reset to "always" - 
change_frequency=SitemapPageChangeFrequency.ALWAYS, - # Invalid input -- should be reset to 0.5 (the default as per the spec) - priority=Decimal("0.5"), - ), - ], - ), - IndexXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_news_index_1.xml", - sub_sitemaps=[ - PagesXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_news_1.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/foo.html", - news_story=SitemapNewsStory( - title="Foo <foo>", - publish_date=self.TEST_DATE_DATETIME, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - ), + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_pages.xml + + # Intentionally spelled as "Site-map" as Google tolerates this: + # https://github.com/google/robotstxt/blob/master/robots.cc#L703 + Site-map: {self.TEST_BASE_URL}/sitemap_news_index_1.xml + """ + ).strip(), + ) + + # One sitemap for random static pages + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_pages.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + <?xml version="1.0" encoding="UTF-8"?> + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url> + <loc>{self.TEST_BASE_URL}/about.html</loc> + <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> + <changefreq>monthly</changefreq> + <priority>0.8</priority> + </url> + <url> + <loc>{self.TEST_BASE_URL}/contact.html</loc> + <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> + + <!-- Invalid change frequency --> + <changefreq>when we feel like it</changefreq> + + <!-- Invalid priority --> + <priority>1.1</priority> + + </url> + </urlset> + """ + ).strip(), + ) + + # Index sitemap pointing to sitemaps with stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_index_1.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + <?xml version="1.0" encoding="UTF-8"?> + <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <sitemap> + <loc>{self.TEST_BASE_URL}/sitemap_news_1.xml</loc> + <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> + </sitemap> + <sitemap> + <loc>{self.TEST_BASE_URL}/sitemap_news_index_2.xml</loc> + <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> + </sitemap> + </sitemapindex> + """ + ).strip(), + ) + + # First sitemap with actual stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_1.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + <?xml version="1.0" encoding="UTF-8"?> + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" + xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" + xmlns:xhtml="http://www.w3.org/1999/xhtml"> + + <url> + <loc>{self.TEST_BASE_URL}/news/foo.html</loc> + + <!-- Element present but empty --> + <lastmod /> + + <!-- Some other XML namespace --> + <xhtml:link rel="alternate" + media="only screen and (max-width: 640px)" + href="{self.TEST_BASE_URL}/news/foo.html?mobile=1" /> + + <news:news> + <news:publication> + <news:name>{self.TEST_PUBLICATION_NAME}</news:name> + <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> + </news:publication> + <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> + <news:title>Foo <foo></news:title> <!-- HTML entity decoding --> + </news:news> + </url> + + <!-- Has a duplicate story in /sitemap_news_2.xml --> + <url> + <loc>{self.TEST_BASE_URL}/news/bar.html</loc> + 
<xhtml:link rel="alternate" + media="only screen and (max-width: 640px)" + href="{self.TEST_BASE_URL}/news/bar.html?mobile=1" /> + <news:news> + <news:publication> + <news:name>{self.TEST_PUBLICATION_NAME}</news:name> + <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> + </news:publication> + <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> + <news:title>Bar & bar</news:title> + </news:news> + </url> + + </urlset> + """ + ).strip(), + ) + + # Another index sitemap pointing to a second sitemaps with stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_index_2.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + <?xml version="1.0" encoding="UTF-8"?> + <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + + <sitemap> + <!-- Extra whitespace added around URL --> + <loc> {self.TEST_BASE_URL}/sitemap_news_2.xml </loc> + <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> + </sitemap> + + <!-- Nonexistent sitemap --> + <sitemap> + <loc>{self.TEST_BASE_URL}/sitemap_news_missing.xml</loc> + <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod> + </sitemap> + + </sitemapindex> + """ + ).strip(), + ) + + # Second sitemap with actual stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_2.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + <?xml version="1.0" encoding="UTF-8"?> + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" + xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" + xmlns:xhtml="http://www.w3.org/1999/xhtml"> + + <!-- Has a duplicate story in /sitemap_news_1.xml --> + <url> + <!-- Extra whitespace added around URL --> + <loc> {self.TEST_BASE_URL}/news/bar.html </loc> + <xhtml:link rel="alternate" + media="only screen and (max-width: 640px)" + href="{self.TEST_BASE_URL}/news/bar.html?mobile=1#fragment_is_to_be_removed" /> + <news:news> + <news:publication> + <news:name>{self.TEST_PUBLICATION_NAME}</news:name> + <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> + </news:publication> + <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> + + <tag_without_inner_character_data name="value" /> + + <news:title>Bar & bar</news:title> + </news:news> + </url> + + <url> + <loc>{self.TEST_BASE_URL}/news/baz.html</loc> + <xhtml:link rel="alternate" + media="only screen and (max-width: 640px)" + href="{self.TEST_BASE_URL}/news/baz.html?mobile=1" /> + <news:news> + <news:publication> + <news:name>{self.TEST_PUBLICATION_NAME}</news:name> + <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> + </news:publication> + <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> + <news:title><![CDATA[Bąž]]></news:title> <!-- CDATA and UTF-8 --> + </news:news> + </url> + + </urlset> + """ + ).strip(), + ) + + # Nonexistent sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_missing.xml", + status_code=404, + reason="Not Found", + headers={"Content-Type": "text/html"}, + text="<h1>404 Not Found!</h1>", + ) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[ + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_pages.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/about.html", + last_modified=self.TEST_DATE_DATETIME, + news_story=None, + change_frequency=SitemapPageChangeFrequency.MONTHLY, + priority=Decimal("0.8"), + ), + 
SitemapPage( + url=f"{self.TEST_BASE_URL}/contact.html", + last_modified=self.TEST_DATE_DATETIME, + news_story=None, + # Invalid input -- should be reset to "always" + change_frequency=SitemapPageChangeFrequency.ALWAYS, + # Invalid input -- should be reset to 0.5 (the default as per the spec) + priority=Decimal("0.5"), + ), + ], + ), + IndexXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_news_index_1.xml", + sub_sitemaps=[ + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_news_1.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/foo.html", + news_story=SitemapNewsStory( + title="Foo <foo>", + publish_date=self.TEST_DATE_DATETIME, + publication_name=self.TEST_PUBLICATION_NAME, + publication_language=self.TEST_PUBLICATION_LANGUAGE, ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/bar.html", - news_story=SitemapNewsStory( - title="Bar & bar", - publish_date=self.TEST_DATE_DATETIME, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - ), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/bar.html", + news_story=SitemapNewsStory( + title="Bar & bar", + publish_date=self.TEST_DATE_DATETIME, + publication_name=self.TEST_PUBLICATION_NAME, + publication_language=self.TEST_PUBLICATION_LANGUAGE, ), - ], - ), - IndexXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_news_index_2.xml", - sub_sitemaps=[ - PagesXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_news_2.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/bar.html", - news_story=SitemapNewsStory( - title="Bar & bar", - publish_date=self.TEST_DATE_DATETIME, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - ), + ), + ], + ), + IndexXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_news_index_2.xml", + sub_sitemaps=[ + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_news_2.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/bar.html", + news_story=SitemapNewsStory( + title="Bar & bar", + publish_date=self.TEST_DATE_DATETIME, + publication_name=self.TEST_PUBLICATION_NAME, + publication_language=self.TEST_PUBLICATION_LANGUAGE, ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/baz.html", - news_story=SitemapNewsStory( - title="Bąž", - publish_date=self.TEST_DATE_DATETIME, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - ), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/baz.html", + news_story=SitemapNewsStory( + title="Bąž", + publish_date=self.TEST_DATE_DATETIME, + publication_name=self.TEST_PUBLICATION_NAME, + publication_language=self.TEST_PUBLICATION_LANGUAGE, ), - ], - ), - InvalidSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_news_missing.xml", - reason=( - f"Unable to fetch sitemap from {self.TEST_BASE_URL}/sitemap_news_missing.xml: " - "404 Not Found" ), + ], + ), + InvalidSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_news_missing.xml", + reason=( + f"Unable to fetch sitemap from {self.TEST_BASE_URL}/sitemap_news_missing.xml: " + "404 Not Found" ), - ], - ), - ], - ), - ], - ) - ], - ) + ), + ], + ), + ], + ), + ], + ) + ], + ) - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - expected_lines = str(expected_sitemap_tree).split() - actual_lines = str(actual_sitemap_tree).split() - diff = difflib.ndiff(expected_lines, actual_lines) - diff_str = "\n".join(diff) + expected_lines = 
str(expected_sitemap_tree).split() + actual_lines = str(actual_sitemap_tree).split() + diff = difflib.ndiff(expected_lines, actual_lines) + diff_str = "\n".join(diff) - assert expected_sitemap_tree == actual_sitemap_tree, diff_str + assert expected_sitemap_tree == actual_sitemap_tree, diff_str - assert len(list(actual_sitemap_tree.all_pages())) == 6 + assert len(list(actual_sitemap_tree.all_pages())) == 6 - def test_sitemap_tree_for_homepage_gzip(self): + def test_sitemap_tree_for_homepage_gzip(self, requests_mock): """Test sitemap_tree_for_homepage() with gzipped sitemaps.""" - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) - m.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_1.gz + Sitemap: {self.TEST_BASE_URL}/sitemap_2.dat + Sitemap: {self.TEST_BASE_URL}/sitemap_3.xml.gz + """ + ).strip(), + ) + + # Gzipped sitemap without correct HTTP header but with .gz extension + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_1.gz", + content=gzip( + textwrap.dedent( f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap_1.gz - Sitemap: {self.TEST_BASE_URL}/sitemap_2.dat - Sitemap: {self.TEST_BASE_URL}/sitemap_3.xml.gz - """ - ).strip(), - ) - - # Gzipped sitemap without correct HTTP header but with .gz extension - m.get( - self.TEST_BASE_URL + "/sitemap_1.gz", - content=gzip( - textwrap.dedent( - f""" - <?xml version="1.0" encoding="UTF-8"?> - <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" - xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> - <url> - <loc>{self.TEST_BASE_URL}/news/foo.html</loc> - <news:news> - <news:publication> - <news:name>{self.TEST_PUBLICATION_NAME}</news:name> - <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> - </news:publication> - <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> - <news:title>Foo <foo></news:title> <!-- HTML entity decoding --> - </news:news> - </url> - </urlset> - """ - ).strip() - ), - ) - - # Gzipped sitemap with correct HTTP header but without .gz extension - m.get( - self.TEST_BASE_URL + "/sitemap_2.dat", - headers={"Content-Type": "application/x-gzip"}, - content=gzip( - textwrap.dedent( - f""" - <?xml version="1.0" encoding="UTF-8"?> - <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" - xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> - <url> - <loc>{self.TEST_BASE_URL}/news/bar.html</loc> - <news:news> - <news:publication> - <news:name>{self.TEST_PUBLICATION_NAME}</news:name> - <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> - </news:publication> - <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> - <news:title><![CDATA[Bąr]]></news:title> <!-- CDATA and UTF-8 --> - </news:news> - </url> - </urlset> - """ - ).strip() - ), - ) + <?xml version="1.0" encoding="UTF-8"?> + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" + xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> + <url> + 
<loc>{self.TEST_BASE_URL}/news/foo.html</loc> + <news:news> + <news:publication> + <news:name>{self.TEST_PUBLICATION_NAME}</news:name> + <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> + </news:publication> + <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> + <news:title>Foo <foo></news:title> <!-- HTML entity decoding --> + </news:news> + </url> + </urlset> + """ + ).strip() + ), + ) - # Sitemap which appears to be gzipped (due to extension and Content-Type) but really isn't - m.get( - self.TEST_BASE_URL + "/sitemap_3.xml.gz", - headers={"Content-Type": "application/x-gzip"}, - text=textwrap.dedent( + # Gzipped sitemap with correct HTTP header but without .gz extension + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_2.dat", + headers={"Content-Type": "application/x-gzip"}, + content=gzip( + textwrap.dedent( f""" - <?xml version="1.0" encoding="UTF-8"?> - <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" - xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> - <url> - <loc>{self.TEST_BASE_URL}/news/baz.html</loc> - <news:news> - <news:publication> - <news:name>{self.TEST_PUBLICATION_NAME}</news:name> - <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> - </news:publication> - <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> - <news:title><![CDATA[Bąž]]></news:title> <!-- CDATA and UTF-8 --> - </news:news> - </url> - </urlset> - """ - ).strip(), - ) - - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) - - # Don't do an in-depth check, we just need to make sure that gunzip works - assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) - assert len(actual_sitemap_tree.sub_sitemaps) == 1 - - assert isinstance( - actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap - ) - # noinspection PyUnresolvedReferences - assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 3 - - # noinspection PyUnresolvedReferences - sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0] - assert isinstance(sitemap_1, PagesXMLSitemap) - assert len(sitemap_1.pages) == 1 - - # noinspection PyUnresolvedReferences - sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1] - assert isinstance(sitemap_2, PagesXMLSitemap) - assert len(sitemap_2.pages) == 1 - - # noinspection PyUnresolvedReferences - sitemap_3 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[2] - assert isinstance(sitemap_3, PagesXMLSitemap) - assert len(sitemap_3.pages) == 1 - - def test_sitemap_tree_for_homepage_plain_text(self): + <?xml version="1.0" encoding="UTF-8"?> + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" + xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> + <url> + <loc>{self.TEST_BASE_URL}/news/bar.html</loc> + <news:news> + <news:publication> + <news:name>{self.TEST_PUBLICATION_NAME}</news:name> + <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> + </news:publication> + <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> + <news:title><![CDATA[Bąr]]></news:title> <!-- CDATA and UTF-8 --> + </news:news> + </url> + </urlset> + """ + ).strip() + ), + ) + + # Sitemap which appears to be gzipped (due to extension and Content-Type) but really isn't + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_3.xml.gz", + headers={"Content-Type": "application/x-gzip"}, + text=textwrap.dedent( + f""" + <?xml version="1.0" encoding="UTF-8"?> + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" + 
xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> + <url> + <loc>{self.TEST_BASE_URL}/news/baz.html</loc> + <news:news> + <news:publication> + <news:name>{self.TEST_PUBLICATION_NAME}</news:name> + <news:language>{self.TEST_PUBLICATION_LANGUAGE}</news:language> + </news:publication> + <news:publication_date>{self.TEST_DATE_STR_ISO8601}</news:publication_date> + <news:title><![CDATA[Bąž]]></news:title> <!-- CDATA and UTF-8 --> + </news:news> + </url> + </urlset> + """ + ).strip(), + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + # Don't do an in-depth check, we just need to make sure that gunzip works + assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) + assert len(actual_sitemap_tree.sub_sitemaps) == 1 + + assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap) + # noinspection PyUnresolvedReferences + assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 3 + + # noinspection PyUnresolvedReferences + sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0] + assert isinstance(sitemap_1, PagesXMLSitemap) + assert len(sitemap_1.pages) == 1 + + # noinspection PyUnresolvedReferences + sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1] + assert isinstance(sitemap_2, PagesXMLSitemap) + assert len(sitemap_2.pages) == 1 + + # noinspection PyUnresolvedReferences + sitemap_3 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[2] + assert isinstance(sitemap_3, PagesXMLSitemap) + assert len(sitemap_3.pages) == 1 + + def test_sitemap_tree_for_homepage_plain_text(self, requests_mock): """Test sitemap_tree_for_homepage() with plain text sitemaps.""" - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) - m.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap_1.txt - Sitemap: {self.TEST_BASE_URL}/sitemap_2.txt.dat - """ - ).strip(), - ) + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_1.txt + Sitemap: {self.TEST_BASE_URL}/sitemap_2.txt.dat + """ + ).strip(), + ) + + # Plain text uncompressed sitemap (no Content-Type header) + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_1.txt", + text=textwrap.dedent( + f""" + + {self.TEST_BASE_URL}/news/foo.html - # Plain text uncompressed sitemap (no Content-Type header) - m.get( - self.TEST_BASE_URL + "/sitemap_1.txt", - text=textwrap.dedent( + + {self.TEST_BASE_URL}/news/bar.html + + Some other stuff which totally doesn't look like an URL + """ + ).strip(), + ) + + # Plain text compressed sitemap without .gz extension + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_2.txt.dat", + headers={"Content-Type": "application/x-gzip"}, + content=gzip( + textwrap.dedent( f""" - - {self.TEST_BASE_URL}/news/foo.html - - - {self.TEST_BASE_URL}/news/bar.html - - Some other stuff which totally doesn't look like an URL - """ - ).strip(), - ) - - # Plain text compressed sitemap without .gz extension - m.get( - self.TEST_BASE_URL + "/sitemap_2.txt.dat", - headers={"Content-Type": 
"application/x-gzip"}, - content=gzip( - textwrap.dedent( - f""" - {self.TEST_BASE_URL}/news/bar.html - {self.TEST_BASE_URL}/news/baz.html - """ - ).strip() - ), - ) + {self.TEST_BASE_URL}/news/bar.html + {self.TEST_BASE_URL}/news/baz.html + """ + ).strip() + ), + ) - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) - assert len(actual_sitemap_tree.sub_sitemaps) == 1 + assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) + assert len(actual_sitemap_tree.sub_sitemaps) == 1 - assert isinstance( - actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap - ) - # noinspection PyUnresolvedReferences - assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2 + assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap) + # noinspection PyUnresolvedReferences + assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2 - # noinspection PyUnresolvedReferences - sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0] - assert isinstance(sitemap_1, PagesTextSitemap) - assert len(sitemap_1.pages) == 2 + # noinspection PyUnresolvedReferences + sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0] + assert isinstance(sitemap_1, PagesTextSitemap) + assert len(sitemap_1.pages) == 2 - # noinspection PyUnresolvedReferences - sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1] - assert isinstance(sitemap_2, PagesTextSitemap) - assert len(sitemap_2.pages) == 2 + # noinspection PyUnresolvedReferences + sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1] + assert isinstance(sitemap_2, PagesTextSitemap) + assert len(sitemap_2.pages) == 2 - pages = list(actual_sitemap_tree.all_pages()) - assert len(pages) == 4 - assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/foo.html") in pages - assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/bar.html") in pages - assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/baz.html") in pages + pages = list(actual_sitemap_tree.all_pages()) + assert len(pages) == 4 + assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/foo.html") in pages + assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/bar.html") in pages + assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/baz.html") in pages # noinspection DuplicatedCode - def test_sitemap_tree_for_homepage_rss_atom(self): + def test_sitemap_tree_for_homepage_rss_atom(self, requests_mock): """Test sitemap_tree_for_homepage() with RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) - m.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_rss.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_0_3.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_1_0.xml + """ + ).strip(), + ) - Sitemap: {self.TEST_BASE_URL}/sitemap_rss.xml - Sitemap: 
{self.TEST_BASE_URL}/sitemap_atom_0_3.xml - Sitemap: {self.TEST_BASE_URL}/sitemap_atom_1_0.xml - """ - ).strip(), - ) - - # RSS 2.0 sitemap - m.get( - self.TEST_BASE_URL + "/sitemap_rss.xml", - headers={"Content-Type": "application/rss+xml"}, - text=textwrap.dedent( - f""" - <?xml version="1.0" encoding="UTF-8"?> - <rss version="2.0"> - <channel> - <title>Test RSS 2.0 feed - This is a test RSS 2.0 feed. - {self.TEST_BASE_URL} + # RSS 2.0 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_rss.xml", + headers={"Content-Type": "application/rss+xml"}, + text=textwrap.dedent( + f""" + + + + Test RSS 2.0 feed + This is a test RSS 2.0 feed. + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_RFC2822} + + + Test RSS 2.0 story #1 + This is a test RSS 2.0 story #1. + {self.TEST_BASE_URL}/rss_story_1.html + {self.TEST_BASE_URL}/rss_story_1.html {self.TEST_DATE_STR_RFC2822} + - - Test RSS 2.0 story #1 - This is a test RSS 2.0 story #1. - {self.TEST_BASE_URL}/rss_story_1.html - {self.TEST_BASE_URL}/rss_story_1.html - {self.TEST_DATE_STR_RFC2822} - - - - Test RSS 2.0 story #2 - This is a test RSS 2.0 story #2. - {self.TEST_BASE_URL}/rss_story_2.html - {self.TEST_BASE_URL}/rss_story_2.html - {self.TEST_DATE_STR_RFC2822} - - - - - """ - ).strip(), - ) - - # Atom 0.3 sitemap - m.get( - self.TEST_BASE_URL + "/sitemap_atom_0_3.xml", - headers={"Content-Type": "application/atom+xml"}, - text=textwrap.dedent( - f""" - - - Test Atom 0.3 feed - - {self.TEST_DATE_STR_ISO8601} - - - Test Atom 0.3 story #1 - - {self.TEST_BASE_URL}/atom_0_3_story_1.html - {self.TEST_DATE_STR_ISO8601} - - - - Test Atom 0.3 story #2 - - {self.TEST_BASE_URL}/atom_0_3_story_2.html - {self.TEST_DATE_STR_ISO8601} - - - - """ - ).strip(), - ) - - # Atom 1.0 sitemap - m.get( - self.TEST_BASE_URL + "/sitemap_atom_1_0.xml", - headers={"Content-Type": "application/atom+xml"}, - text=textwrap.dedent( - f""" - - - Test Atom 1.0 feed - This is a test Atom 1.0 feed. - - - {self.TEST_BASE_URL} + + Test RSS 2.0 story #2 + This is a test RSS 2.0 story #2. + {self.TEST_BASE_URL}/rss_story_2.html + {self.TEST_BASE_URL}/rss_story_2.html + {self.TEST_DATE_STR_RFC2822} + + + + + """ + ).strip(), + ) + + # Atom 0.3 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_atom_0_3.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" + + + Test Atom 0.3 feed + + {self.TEST_DATE_STR_ISO8601} + + + Test Atom 0.3 story #1 + + {self.TEST_BASE_URL}/atom_0_3_story_1.html + {self.TEST_DATE_STR_ISO8601} + + + + Test Atom 0.3 story #2 + + {self.TEST_BASE_URL}/atom_0_3_story_2.html + {self.TEST_DATE_STR_ISO8601} + + + + """ + ).strip(), + ) + + # Atom 1.0 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_atom_1_0.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" + + + Test Atom 1.0 feed + This is a test Atom 1.0 feed. + + + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_ISO8601} + + + Test Atom 1.0 story #1 + + + + {self.TEST_BASE_URL}/atom_1_0_story_1.html {self.TEST_DATE_STR_ISO8601} + This is test atom 1.0 story #1. + +
+

This is test atom 1.0 story #1.

+
+
+ + John Doe + johndoe@example.com + +
+ + + Test Atom 1.0 story #2 + + + + {self.TEST_BASE_URL}/atom_1_0_story_2.html + {self.TEST_DATE_STR_ISO8601} + This is test atom 1.0 story #2. + +
+

This is test atom 1.0 story #2.

+
+
+ + John Doe + johndoe@example.com + +
+ +
+ """ + ).strip(), + ) - - Test Atom 1.0 story #1 - - - - {self.TEST_BASE_URL}/atom_1_0_story_1.html - {self.TEST_DATE_STR_ISO8601} - This is test atom 1.0 story #1. - -
-

This is test atom 1.0 story #1.

-
-
- - John Doe - johndoe@example.com - -
- - - Test Atom 1.0 story #2 - - - - {self.TEST_BASE_URL}/atom_1_0_story_2.html - {self.TEST_DATE_STR_ISO8601} - This is test atom 1.0 story #2. - -
-

This is test atom 1.0 story #2.

-
-
- - John Doe - johndoe@example.com - -
- -
- """ - ).strip(), - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[ - IndexRobotsTxtSitemap( - url=f"{self.TEST_BASE_URL}/robots.txt", - sub_sitemaps=[ - PagesRSSSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_rss.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/rss_story_1.html", - news_story=SitemapNewsStory( - title="Test RSS 2.0 story #1", - publish_date=self.TEST_DATE_DATETIME, - ), + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[ + PagesRSSSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_rss.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/rss_story_1.html", + news_story=SitemapNewsStory( + title="Test RSS 2.0 story #1", + publish_date=self.TEST_DATE_DATETIME, ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/rss_story_2.html", - news_story=SitemapNewsStory( - title="Test RSS 2.0 story #2", - publish_date=self.TEST_DATE_DATETIME, - ), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/rss_story_2.html", + news_story=SitemapNewsStory( + title="Test RSS 2.0 story #2", + publish_date=self.TEST_DATE_DATETIME, ), - ], - ), - PagesAtomSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_atom_0_3.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/atom_0_3_story_1.html", - news_story=SitemapNewsStory( - title="Test Atom 0.3 story #1", - publish_date=self.TEST_DATE_DATETIME, - ), + ), + ], + ), + PagesAtomSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_atom_0_3.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/atom_0_3_story_1.html", + news_story=SitemapNewsStory( + title="Test Atom 0.3 story #1", + publish_date=self.TEST_DATE_DATETIME, ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/atom_0_3_story_2.html", - news_story=SitemapNewsStory( - title="Test Atom 0.3 story #2", - publish_date=self.TEST_DATE_DATETIME, - ), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/atom_0_3_story_2.html", + news_story=SitemapNewsStory( + title="Test Atom 0.3 story #2", + publish_date=self.TEST_DATE_DATETIME, ), - ], - ), - PagesAtomSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_atom_1_0.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/atom_1_0_story_1.html", - news_story=SitemapNewsStory( - title="Test Atom 1.0 story #1", - publish_date=self.TEST_DATE_DATETIME, - ), + ), + ], + ), + PagesAtomSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_atom_1_0.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/atom_1_0_story_1.html", + news_story=SitemapNewsStory( + title="Test Atom 1.0 story #1", + publish_date=self.TEST_DATE_DATETIME, ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/atom_1_0_story_2.html", - news_story=SitemapNewsStory( - title="Test Atom 1.0 story #2", - publish_date=self.TEST_DATE_DATETIME, - ), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/atom_1_0_story_2.html", + news_story=SitemapNewsStory( + title="Test Atom 1.0 story #2", + publish_date=self.TEST_DATE_DATETIME, ), - ], - ), - ], - ) - ], - ) + ), + ], + ), + ], + ) + ], + ) - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - expected_lines = str(expected_sitemap_tree).split() - actual_lines = str(actual_sitemap_tree).split() - diff = difflib.ndiff(expected_lines, actual_lines) - diff_str = "\n".join(diff) + expected_lines = str(expected_sitemap_tree).split() + actual_lines = str(actual_sitemap_tree).split() + 
diff = difflib.ndiff(expected_lines, actual_lines) + diff_str = "\n".join(diff) - assert expected_sitemap_tree == actual_sitemap_tree, diff_str + assert expected_sitemap_tree == actual_sitemap_tree, diff_str - assert len(list(actual_sitemap_tree.all_pages())) == 6 + assert len(list(actual_sitemap_tree.all_pages())) == 6 - def test_sitemap_tree_for_homepage_rss_atom_empty(self): + def test_sitemap_tree_for_homepage_rss_atom_empty(self, requests_mock): """Test sitemap_tree_for_homepage() with empty RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) - m.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_rss.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_0_3.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_1_0.xml + """ + ).strip(), + ) - Sitemap: {self.TEST_BASE_URL}/sitemap_rss.xml - Sitemap: {self.TEST_BASE_URL}/sitemap_atom_0_3.xml - Sitemap: {self.TEST_BASE_URL}/sitemap_atom_1_0.xml - """ - ).strip(), - ) - - # RSS 2.0 sitemap - m.get( - self.TEST_BASE_URL + "/sitemap_rss.xml", - headers={"Content-Type": "application/rss+xml"}, - text=textwrap.dedent( - f""" - - - - Test RSS 2.0 feed - This is a test RSS 2.0 feed. - {self.TEST_BASE_URL} - {self.TEST_DATE_STR_RFC2822} - - - """ - ).strip(), - ) - - # Atom 0.3 sitemap - m.get( - self.TEST_BASE_URL + "/sitemap_atom_0_3.xml", - headers={"Content-Type": "application/atom+xml"}, - text=textwrap.dedent( - f""" - - - Test Atom 0.3 feed - - {self.TEST_DATE_STR_ISO8601} - - """ - ).strip(), - ) - - # Atom 1.0 sitemap - m.get( - self.TEST_BASE_URL + "/sitemap_atom_1_0.xml", - headers={"Content-Type": "application/atom+xml"}, - text=textwrap.dedent( - f""" - - - Test Atom 1.0 feed - This is a test Atom 1.0 feed. - - - {self.TEST_BASE_URL} - {self.TEST_DATE_STR_ISO8601} - - """ - ).strip(), - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[ - IndexRobotsTxtSitemap( - url=f"{self.TEST_BASE_URL}/robots.txt", - sub_sitemaps=[ - PagesRSSSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_rss.xml", - pages=[], - ), - PagesAtomSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_atom_0_3.xml", - pages=[], - ), - PagesAtomSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_atom_1_0.xml", - pages=[], - ), - ], - ) - ], - ) - - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) - - assert expected_sitemap_tree == actual_sitemap_tree - - assert len(list(actual_sitemap_tree.all_pages())) == 0 - - def test_sitemap_tree_for_homepage_prematurely_ending_xml(self): + # RSS 2.0 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_rss.xml", + headers={"Content-Type": "application/rss+xml"}, + text=textwrap.dedent( + f""" + + + + Test RSS 2.0 feed + This is a test RSS 2.0 feed. 
+ {self.TEST_BASE_URL} + {self.TEST_DATE_STR_RFC2822} + + + """ + ).strip(), + ) + + # Atom 0.3 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_atom_0_3.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" + + + Test Atom 0.3 feed + + {self.TEST_DATE_STR_ISO8601} + + """ + ).strip(), + ) + + # Atom 1.0 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_atom_1_0.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" + + + Test Atom 1.0 feed + This is a test Atom 1.0 feed. + + + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_ISO8601} + + """ + ).strip(), + ) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[ + PagesRSSSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_rss.xml", + pages=[], + ), + PagesAtomSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_atom_0_3.xml", + pages=[], + ), + PagesAtomSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_atom_1_0.xml", + pages=[], + ), + ], + ) + ], + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + assert expected_sitemap_tree == actual_sitemap_tree + + assert len(list(actual_sitemap_tree.all_pages())) == 0 + + def test_sitemap_tree_for_homepage_prematurely_ending_xml(self, requests_mock): """Test sitemap_tree_for_homepage() with clipped XML. Some webservers are misconfigured to limit the request length to a certain number of seconds, in which time the @@ -965,283 +944,266 @@ def test_sitemap_tree_for_homepage_prematurely_ending_xml(self): this behavior, so we have to support this too. """ - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever - m.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap.xml - """ - ).strip(), - ) + Sitemap: {self.TEST_BASE_URL}/sitemap.xml + """ + ).strip(), + ) - m.get( - self.TEST_BASE_URL + "/sitemap.xml", - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/news/first.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - First story - - - - {self.TEST_BASE_URL}/news/second.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - Second story - - - - - - {self.TEST_BASE_URL}/news/third.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - + + + {self.TEST_BASE_URL}/news/first.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + First story + + + + {self.TEST_BASE_URL}/news/second.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + Second story + + + + + + {self.TEST_BASE_URL}/news/third.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + - - - {self.TEST_BASE_URL}/news/public.html - - - """ - ).strip(), - ) + 
# Public sitemap (linked to from robots.txt) + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_public.xml", + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/news/public.html + + + """ + ).strip(), + ) - # Private sitemap (to be discovered by trying out a few paths) - m.get( - self.TEST_BASE_URL + "/sitemap_index.xml", - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/news/private.html - - - """ - ).strip(), - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[ - IndexRobotsTxtSitemap( - url=f"{self.TEST_BASE_URL}/robots.txt", - sub_sitemaps=[ - PagesXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_public.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/public.html", - ), - ], - ), - ], - ), - PagesXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_index.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/private.html", - ), - ], - ), - ], - ) - - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) - - assert expected_sitemap_tree == actual_sitemap_tree - - def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self): + # Private sitemap (to be discovered by trying out a few paths) + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_index.xml", + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/news/private.html + + + """ + ).strip(), + ) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[ + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_public.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/public.html", + ), + ], + ), + ], + ), + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_index.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/private.html", + ), + ], + ), + ], + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + assert expected_sitemap_tree == actual_sitemap_tree + + def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self, requests_mock): """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt.""" - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - m.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": ""}, - text=textwrap.dedent( - """ - User-agent: * - Disallow: /whatever - """.format() - ).strip(), - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[ - IndexRobotsTxtSitemap( - url=f"{self.TEST_BASE_URL}/robots.txt", - sub_sitemaps=[], - ) - ], - ) - - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) - - assert expected_sitemap_tree == actual_sitemap_tree - - def test_sitemap_tree_for_homepage_no_robots_txt(self): + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": ""}, + text=textwrap.dedent( + """ + User-agent: * + Disallow: /whatever + """.format() + ).strip(), + ) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[], + ) + ], + ) + + 
actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + assert expected_sitemap_tree == actual_sitemap_tree + + def test_sitemap_tree_for_homepage_no_robots_txt(self, requests_mock): """Test sitemap_tree_for_homepage() with no robots.txt.""" - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) - # Nonexistent robots.txt - m.get( - self.TEST_BASE_URL + "/robots.txt", - status_code=404, - reason="Not Found", - headers={"Content-Type": "text/html"}, - text="

<h1>404 Not Found!</h1>
", - ) + # Nonexistent robots.txt + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + status_code=404, + reason="Not Found", + headers={"Content-Type": "text/html"}, + text="

<h1>404 Not Found!</h1>
", + ) - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[], - ) + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[], + ) - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - assert expected_sitemap_tree == actual_sitemap_tree + assert expected_sitemap_tree == actual_sitemap_tree - def test_sitemap_tree_for_homepage_huge_sitemap(self): + def test_sitemap_tree_for_homepage_huge_sitemap(self, requests_mock): """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling).""" page_count = 1000 @@ -1277,90 +1239,84 @@ def test_sitemap_tree_for_homepage_huge_sitemap(self): sitemap_xml += "
" - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) - m.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap.xml.gz - """ - ).strip(), - ) + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap.xml.gz + """ + ).strip(), + ) - m.get( - self.TEST_BASE_URL + "/sitemap.xml.gz", - headers={"Content-Type": "application/x-gzip"}, - content=gzip(sitemap_xml), - ) + requests_mock.get( + self.TEST_BASE_URL + "/sitemap.xml.gz", + headers={"Content-Type": "application/x-gzip"}, + content=gzip(sitemap_xml), + ) - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - assert len(list(actual_sitemap_tree.all_pages())) == page_count + assert len(list(actual_sitemap_tree.all_pages())) == page_count - def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self): + def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self, requests_mock): """Test sitemap_tree_for_homepage() with weird (but valid) spacing.""" - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) - robots_txt_body = "" - robots_txt_body += "User-agent: *\n" - # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL - robots_txt_body += f" Sitemap:{self.TEST_BASE_URL}/sitemap.xml " + robots_txt_body = "" + robots_txt_body += "User-agent: *\n" + # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL + robots_txt_body += f" Sitemap:{self.TEST_BASE_URL}/sitemap.xml " - m.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=robots_txt_body, - ) + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=robots_txt_body, + ) - m.get( - self.TEST_BASE_URL + "/sitemap.xml", - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/news/first.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - First story - - - - """ - ).strip(), - ) + requests_mock.get( + self.TEST_BASE_URL + "/sitemap.xml", + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/news/first.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + First story + + + + """ + ).strip(), + ) - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) - assert len(list(actual_sitemap_tree.all_pages())) == 1 + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + assert len(list(actual_sitemap_tree.all_pages())) == 1 - def test_sitemap_tree_for_homepage_utf8_bom(self): + def 
test_sitemap_tree_for_homepage_utf8_bom(self, requests_mock): """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and sitemap.""" robots_txt_body = textwrap.dedent( @@ -1395,26 +1351,23 @@ def test_sitemap_tree_for_homepage_utf8_bom(self): robots_txt_body_encoded = robots_txt_body.encode("utf-8-sig") sitemap_xml_body_encoded = sitemap_xml_body.encode("utf-8-sig") - with requests_mock.Mocker() as m: - m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - m.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - m.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - content=robots_txt_body_encoded, - ) - - m.get( - self.TEST_BASE_URL + "/sitemap.xml", - content=sitemap_xml_body_encoded, - ) - - actual_sitemap_tree = sitemap_tree_for_homepage( - homepage_url=self.TEST_BASE_URL - ) - assert len(list(actual_sitemap_tree.all_pages())) == 1 + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + content=robots_txt_body_encoded, + ) + + requests_mock.get( + self.TEST_BASE_URL + "/sitemap.xml", + content=sitemap_xml_body_encoded, + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + assert len(list(actual_sitemap_tree.all_pages())) == 1 diff --git a/usp/objects/sitemap.py b/usp/objects/sitemap.py index 612b537..0cac96a 100644 --- a/usp/objects/sitemap.py +++ b/usp/objects/sitemap.py @@ -262,7 +262,7 @@ def all_pages(self) -> Iterator[SitemapPage]: :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any). """ for sub_sitemap in self.sub_sitemaps: - yield from self.all_pages() + yield from sub_sitemap.all_pages() class IndexWebsiteSitemap(AbstractIndexSitemap): From bd64b62071f64bb0b48c95fa987a08d767150d7f Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sat, 17 Aug 2024 10:20:34 +0100 Subject: [PATCH 27/79] Convert requests tests to use fixtures --- tests/web_client/test_requests_client.py | 156 +++++++++++------------ 1 file changed, 73 insertions(+), 83 deletions(-) diff --git a/tests/web_client/test_requests_client.py b/tests/web_client/test_requests_client.py index 450545c..0a3e1a2 100644 --- a/tests/web_client/test_requests_client.py +++ b/tests/web_client/test_requests_client.py @@ -1,9 +1,8 @@ import re import socket from http import HTTPStatus -from unittest import TestCase -import requests_mock +import pytest from usp.__about__ import __version__ from usp.web_client.abstract_client import ( @@ -13,84 +12,76 @@ from usp.web_client.requests_client import RequestsWebClient -class TestRequestsClient(TestCase): +class TestRequestsClient: TEST_BASE_URL = "http://test-ultimate-sitemap-parser.com" # mocked by HTTPretty TEST_CONTENT_TYPE = "text/html" - __slots__ = [ - "__client", - ] + @pytest.fixture + def client(self): + return RequestsWebClient() - def setUp(self) -> None: - super().setUp() + def test_get(self, client, requests_mock): + test_url = self.TEST_BASE_URL + "/" + test_content = "This is a homepage." - self.__client = RequestsWebClient() - - def test_get(self): - with requests_mock.Mocker() as m: - test_url = self.TEST_BASE_URL + "/" - test_content = "This is a homepage." 
- - m.get( - test_url, - headers={"Content-Type": self.TEST_CONTENT_TYPE}, - text=test_content, - ) - - response = self.__client.get(test_url) - - assert response - assert isinstance(response, AbstractWebClientSuccessResponse) - assert response.status_code() == HTTPStatus.OK.value - assert response.status_message() == HTTPStatus.OK.phrase - assert response.header("Content-Type") == self.TEST_CONTENT_TYPE - assert response.header("content-type") == self.TEST_CONTENT_TYPE - assert response.header("nonexistent") is None - assert response.raw_data().decode("utf-8") == test_content - - def test_get_user_agent(self): - with requests_mock.Mocker() as m: - test_url = self.TEST_BASE_URL + "/" + requests_mock.get( + test_url, + headers={"Content-Type": self.TEST_CONTENT_TYPE}, + text=test_content, + ) - def content_user_agent(request, context): - context.status_code = HTTPStatus.OK.value - return request.headers.get("User-Agent", "unknown") + response = client.get(test_url) - m.get( - test_url, - text=content_user_agent, - ) + assert response + assert isinstance(response, AbstractWebClientSuccessResponse) + assert response.status_code() == HTTPStatus.OK.value + assert response.status_message() == HTTPStatus.OK.phrase + assert response.header("Content-Type") == self.TEST_CONTENT_TYPE + assert response.header("content-type") == self.TEST_CONTENT_TYPE + assert response.header("nonexistent") is None + assert response.raw_data().decode("utf-8") == test_content + + def test_get_user_agent(self, client, requests_mock): + test_url = self.TEST_BASE_URL + "/" + + def content_user_agent(request, context): + context.status_code = HTTPStatus.OK.value + return request.headers.get("User-Agent", "unknown") + + requests_mock.get( + test_url, + text=content_user_agent, + ) - response = self.__client.get(test_url) + response = client.get(test_url) - assert response - assert isinstance(response, AbstractWebClientSuccessResponse) + assert response + assert isinstance(response, AbstractWebClientSuccessResponse) - content = response.raw_data().decode("utf-8") - assert content == f"ultimate_sitemap_parser/{__version__}" + content = response.raw_data().decode("utf-8") + assert content == f"ultimate_sitemap_parser/{__version__}" - def test_get_not_found(self): - with requests_mock.Mocker() as m: - test_url = self.TEST_BASE_URL + "/404.html" + def test_get_not_found(self, client, requests_mock): + test_url = self.TEST_BASE_URL + "/404.html" - m.get( - test_url, - status_code=HTTPStatus.NOT_FOUND.value, - reason=HTTPStatus.NOT_FOUND.phrase, - headers={"Content-Type": self.TEST_CONTENT_TYPE}, - text="This page does not exist.", - ) + requests_mock.get( + test_url, + status_code=HTTPStatus.NOT_FOUND.value, + reason=HTTPStatus.NOT_FOUND.phrase, + headers={"Content-Type": self.TEST_CONTENT_TYPE}, + text="This page does not exist.", + ) - response = self.__client.get(test_url) + response = client.get(test_url) - assert response - assert isinstance(response, WebClientErrorResponse) - assert response.retryable() is False + assert response + assert isinstance(response, WebClientErrorResponse) + assert response.retryable() is False - def test_get_nonexistent_domain(self): + def test_get_nonexistent_domain(self, client): test_url = "http://www.totallydoesnotexisthjkfsdhkfsd.com/some_page.html" - response = self.__client.get(test_url) + response = client.get(test_url) assert response assert isinstance(response, WebClientErrorResponse) @@ -102,7 +93,7 @@ def test_get_nonexistent_domain(self): is not None ) - def test_get_timeout(self): + 
def test_get_timeout(self, client): sock = socket.socket() sock.bind(("", 0)) socket_port = sock.getsockname()[1] @@ -112,9 +103,9 @@ def test_get_timeout(self): test_timeout = 1 test_url = f"http://127.0.0.1:{socket_port}/slow_page.html" - self.__client.set_timeout(test_timeout) + client.set_timeout(test_timeout) - response = self.__client.get(test_url) + response = client.get(test_url) sock.close() @@ -123,26 +114,25 @@ def test_get_timeout(self): assert response.retryable() is True assert "Read timed out" in response.message() - def test_get_max_response_data_length(self): - with requests_mock.Mocker() as m: - actual_length = 1024 * 1024 - max_length = 1024 * 512 + def test_get_max_response_data_length(self, client, requests_mock): + actual_length = 1024 * 1024 + max_length = 1024 * 512 - test_url = self.TEST_BASE_URL + "/huge_page.html" - test_content = "a" * actual_length + test_url = self.TEST_BASE_URL + "/huge_page.html" + test_content = "a" * actual_length - m.get( - test_url, - headers={"Content-Type": self.TEST_CONTENT_TYPE}, - text=test_content, - ) + requests_mock.get( + test_url, + headers={"Content-Type": self.TEST_CONTENT_TYPE}, + text=test_content, + ) - self.__client.set_max_response_data_length(max_length) + client.set_max_response_data_length(max_length) - response = self.__client.get(test_url) + response = client.get(test_url) - assert response - assert isinstance(response, AbstractWebClientSuccessResponse) + assert response + assert isinstance(response, AbstractWebClientSuccessResponse) - response_length = len(response.raw_data()) - assert response_length == max_length + response_length = len(response.raw_data()) + assert response_length == max_length From 79f35224720ea67f606649650fc13e66b222df81 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sun, 18 Aug 2024 14:24:36 +0100 Subject: [PATCH 28/79] Add integration tests and improve performance --- .gitignore | 5 +- poetry.lock | 775 ++++++++++++++++++++++++- pyproject.toml | 18 +- tests/conftest.py | 0 tests/integration/README.md | 51 ++ tests/integration/cassettes/.gitignore | 2 + tests/integration/conftest.py | 20 + tests/integration/test_integration.py | 42 ++ usp/fetch_parse.py | 19 +- usp/helpers.py | 25 +- 10 files changed, 944 insertions(+), 13 deletions(-) delete mode 100644 tests/conftest.py create mode 100644 tests/integration/README.md create mode 100644 tests/integration/cassettes/.gitignore create mode 100644 tests/integration/conftest.py create mode 100644 tests/integration/test_integration.py diff --git a/.gitignore b/.gitignore index 5e5c966..7132567 100644 --- a/.gitignore +++ b/.gitignore @@ -114,4 +114,7 @@ dmypy.json # Pyre type checker .pyre/ -.idea/ \ No newline at end of file +.idea/ + +# Memray reports +memray/ \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 70727a3..9973c30 100644 --- a/poetry.lock +++ b/poetry.lock @@ -157,6 +157,325 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "jinja2" +version = "3.1.4" +description = "A very fast and expressive template engine." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "linkify-it-py" +version = "2.0.3" +description = "Links recognition library with FULL unicode support." +optional = false +python-versions = ">=3.7" +files = [ + {file = "linkify-it-py-2.0.3.tar.gz", hash = "sha256:68cda27e162e9215c17d786649d1da0021a451bdc436ef9e0fa0ba5234b9b048"}, + {file = "linkify_it_py-2.0.3-py3-none-any.whl", hash = "sha256:6bcbc417b0ac14323382aef5c5192c0075bf8a9d6b41820a2b66371eac6b6d79"}, +] + +[package.dependencies] +uc-micro-py = "*" + +[package.extras] +benchmark = ["pytest", "pytest-benchmark"] +dev = ["black", "flake8", "isort", "pre-commit", "pyproject-flake8"] +doc = ["myst-parser", "sphinx", "sphinx-book-theme"] +test = ["coverage", "pytest", "pytest-cov"] + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +optional = false +python-versions = ">=3.8" +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +linkify-it-py = {version = ">=1,<3", optional = true, markers = "extra == \"linkify\""} +mdit-py-plugins = {version = "*", optional = true, markers = "extra == \"plugins\""} +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "markupsafe" +version = "2.1.5" +description = "Safely add untrusted strings to HTML/XML markup." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, + {file = 
"MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, + {file = 
"MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, + {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, +] + +[[package]] +name = "mdit-py-plugins" +version = "0.4.1" +description = "Collection of plugins for markdown-it-py" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mdit_py_plugins-0.4.1-py3-none-any.whl", hash = "sha256:1020dfe4e6bfc2c79fb49ae4e3f5b297f5ccd20f010187acc52af2921e27dc6a"}, + {file = "mdit_py_plugins-0.4.1.tar.gz", hash = "sha256:834b8ac23d1cd60cec703646ffd22ae97b7955a6d596eb1d304be1e251ae499c"}, +] + +[package.dependencies] +markdown-it-py = ">=1.0.0,<4.0.0" + +[package.extras] +code-style = ["pre-commit"] +rtd = ["myst-parser", "sphinx-book-theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + +[[package]] +name = "memray" +version = "1.13.4" +description = "A memory profiler for Python applications" +optional = 
false +python-versions = ">=3.7.0" +files = [ + {file = "memray-1.13.4-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:ed0bfcffbd857cbf78a4db942019e9e153019b754048b0522065844d1c538e8c"}, + {file = "memray-1.13.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fcf71802b2c6d68c5336b1e4ae341eab64dcccd0dcf67687af53f18bc020237b"}, + {file = "memray-1.13.4-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6c9ae675131492bdfafcc44e86d0b81401ea8d052a9cab7793b1dab642cd58e6"}, + {file = "memray-1.13.4-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:bac9d30ce39aaee40601087d09c1639a071293f414b5e726a152ed3581d25e50"}, + {file = "memray-1.13.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a437c7e28734028a2f43f942c3146e9737033718cea092ea910f6de3cf46221d"}, + {file = "memray-1.13.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:3cae161d5b6769cc3af574cfa0c7ea77f98d6ae714ba5ec508f6f05b84800801"}, + {file = "memray-1.13.4-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:bf407123e175de4f5a7264886eb64ea514f4b388b617f05dfcd857d99ecadd1c"}, + {file = "memray-1.13.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a6f1bd3d0adf84f864e24f74552c1533224e64283dfee33641011acf384fc138"}, + {file = "memray-1.13.4-cp311-cp311-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ba5bb9a3b7c3c08752f3b55a3b5b360963c9f666e2220eb388ab6f7d1271d843"}, + {file = "memray-1.13.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1e8cec70e51e81c0e9448e62a5366914b74a3dbb60826cdec8f0e7559e58e74"}, + {file = "memray-1.13.4-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:81497e578017feb57a46e19c349450888e57ff7fb8f0f5134d3e07605c435500"}, + {file = "memray-1.13.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e585d866c82ce92060fa1c925298aa8b89936ca22df9698a25a5f0cf7ca81fa2"}, + {file = "memray-1.13.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3d048da01dc138711a2c9c70ba693d186690c98fb0ca26fdc3483486d4849238"}, + {file = "memray-1.13.4-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:b6459761046ab46638d2c62d7f3f55eaaf45a947bd1d36dcfb5e860047280557"}, + {file = "memray-1.13.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:637651f5ca2870e9156f189c337e8c6d0002e3f6f7d44d6486ff5baf12a6115e"}, + {file = "memray-1.13.4-cp312-cp312-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d5b9e10fde6f652ea176cbc0d4d4c563d2831faec4434d3e03c4c0aff8ddc6c0"}, + {file = "memray-1.13.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1f3ab803b703b9be29259039caf43803ad5abf37f04e77cd9e8373054dd91f6"}, + {file = "memray-1.13.4-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfdc070da2df9241f78b7429d44f6ee16e924d43eddc587f6ed7218c4cb792d3"}, + {file = "memray-1.13.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:523a63dee71cd4d55eddca866244a045e7549ca5137ec906c62893b87a2161ce"}, + {file = "memray-1.13.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3bf06f8883a26b779cc828addad97a2d39d7587263e348655dae3ec90b6ee079"}, + {file = "memray-1.13.4-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ee47b798d5712faa2a38ff60b5c77f73ed8342709e15bd6ed3a46edd9feee089"}, + {file = "memray-1.13.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:daed9ae6ceea6a4dcc96ac62cfa00d90f7365ed54ee90886e5287251855a2c02"}, + {file = 
"memray-1.13.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc194c5564f5b33d33bbbeca54b0e3f637d985a144c3551f887bf668d896ffec"}, + {file = "memray-1.13.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:554cda50101a14e1bca2453bd1013949cdc4ebcd4709065139955578f8b0c354"}, + {file = "memray-1.13.4-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:e3e261995acc40f8bd4a10740b25f37eccc3e413449ceb73263216ec3c6fee7b"}, + {file = "memray-1.13.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5e8ef7cc755a5a3a4b5d9991cf70428220e9138bc5967b5764b2fa7a1fb4d7a6"}, + {file = "memray-1.13.4-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7c8ea410c842a31ee07e76b409c273d23e0b670d166c2330488d1164ab42a4d1"}, + {file = "memray-1.13.4-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:22c9656c78a50b4569a8273d062cc6c139e9f7cfec3f793e60ef7e0cb46c2a76"}, + {file = "memray-1.13.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7096401ff3b7ce5f759e45bd71b0899ad50a607924ee23ad02423aa4fa505f52"}, + {file = "memray-1.13.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2072a454846e055d5833d17b1f5e6106ae18e425974c3968276a1952b0156f2b"}, + {file = "memray-1.13.4-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:f19fb4a20d796af869691531d2d6540d27bfbf0df9118bd17e547a23224a8b3e"}, + {file = "memray-1.13.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b1331de55aab1c409ec295b155770487004e0c6d08db0fe903e9377be978bacd"}, + {file = "memray-1.13.4-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f43bfa96114012d76025a0ee42206c7d914b6113c491dc221e1b7a901c51242a"}, + {file = "memray-1.13.4-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e0cc6299a09d3c51fd1a45bd816e9dd68a7092a3eaf62aeaa2ee01068f2a1d0b"}, + {file = "memray-1.13.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4cbaa39bf7041e32282ad84a9fd9bd98622f78f649ccb7d1075382fed84e8125"}, + {file = "memray-1.13.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59bba975fbbced85fe6f61685aef459777faf3b8a5b4f0de4fa5b99cf6f1a5e7"}, + {file = "memray-1.13.4.tar.gz", hash = "sha256:48f8f9b89b3a84028668244151eb7248189fb3f4f2a761ec1211439adcbb2ad1"}, +] + +[package.dependencies] +jinja2 = ">=2.9" +rich = ">=11.2.0" +textual = ">=0.41.0" + +[package.extras] +benchmark = ["asv"] +dev = ["Cython", "asv", "black", "bump2version", "check-manifest", "flake8", "furo", "greenlet", "ipython", "isort", "mypy", "pytest", "pytest-cov", "pytest-textual-snapshot", "setuptools", "sphinx", "sphinx-argparse", "textual (>=0.43,!=0.65.2,!=0.66)", "towncrier"] +docs = ["IPython", "bump2version", "furo", "sphinx", "sphinx-argparse", "towncrier"] +lint = ["black", "check-manifest", "flake8", "isort", "mypy"] +test = ["Cython", "greenlet", "ipython", "pytest", "pytest-cov", "pytest-textual-snapshot", "setuptools", "textual (>=0.43,!=0.65.2,!=0.66)"] + +[[package]] +name = "multidict" +version = "6.0.5" +description = "multidict implementation" +optional = false +python-versions = ">=3.7" +files = [ + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc"}, + {file = "multidict-6.0.5-cp310-cp310-win32.whl", hash = "sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319"}, + {file = "multidict-6.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e"}, + {file = "multidict-6.0.5-cp311-cp311-win32.whl", hash = "sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c"}, + {file = "multidict-6.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda"}, + {file = "multidict-6.0.5-cp312-cp312-win32.whl", hash = "sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5"}, + {file = "multidict-6.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556"}, + {file = "multidict-6.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash 
= "sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc"}, + {file = "multidict-6.0.5-cp37-cp37m-win32.whl", hash = "sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee"}, + {file = "multidict-6.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461"}, + {file = 
"multidict-6.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44"}, + {file = "multidict-6.0.5-cp38-cp38-win32.whl", hash = "sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241"}, + {file = "multidict-6.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c"}, + {file = "multidict-6.0.5-cp39-cp39-win32.whl", hash = "sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b"}, + {file = "multidict-6.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755"}, + {file = "multidict-6.0.5-py3-none-any.whl", hash = "sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7"}, + {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"}, +] + [[package]] name = "packaging" version = "24.1" @@ -183,6 +502,97 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "pygments" +version = "2.18.0" +description = "Pygments is a syntax highlighting package written in Python." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"}, + {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] + +[[package]] +name = "pyinstrument" +version = "4.7.2" +description = "Call stack profiler for Python. Shows you why your code is slow!" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyinstrument-4.7.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a316a929a29e4fb1c0a122c503e9442580daf485be20bd713fcc60b98bb48509"}, + {file = "pyinstrument-4.7.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:50c56106e4b3a92dbf1c9d36b307cf67c5b667ae35195d41cf1ded7afc26a01a"}, + {file = "pyinstrument-4.7.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:528b6c8267ebe114d04c8e189f80907b6af9e7a7d6a6597f2833ddcfedbde66f"}, + {file = "pyinstrument-4.7.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9f856e7edd39f73d7a68180f03133fc7c6331d3849b8db4d480028c36433ab46"}, + {file = "pyinstrument-4.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6f28831c8386bf820d014282c2e8748049819f61eacb210029fd7e08f45df37"}, + {file = "pyinstrument-4.7.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:78735eb3822746fd12f37ab9a84df35b613b9824b0f8819529c41d9aa09c26c6"}, + {file = "pyinstrument-4.7.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:03dfecfcb7d699b7d8f9d36fb6a11c476233a71eeea78b466c69bca300029603"}, + {file = "pyinstrument-4.7.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b9bd25ba7ef070f538c5e3c6b4a991ce6837a6a2c49c4feba10cb8f5f60182f4"}, + {file = "pyinstrument-4.7.2-cp310-cp310-win32.whl", hash = "sha256:fee18be41331fe0a016c315ea36da4ce965d1fdba051edad16823771e4a0c03d"}, + {file = "pyinstrument-4.7.2-cp310-cp310-win_amd64.whl", hash = "sha256:1a73eb6c07b8c52b976b8a0029dc3dfee83c487f640e97c4b84fcf15cda91caa"}, + {file = "pyinstrument-4.7.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:19c51585e93482cdef7d627f8210f6272d357bf298b6ebd9761bdc2cf50f1b30"}, + {file = "pyinstrument-4.7.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:201eb2460f815efda749a659bf4315d27e964a522c83e04173a052ce89de06d4"}, + {file = "pyinstrument-4.7.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:518f7fbb0f05377391b72e72e8d6942d6413a0d36df0e77a4625b6cbd4ce84fc"}, + {file = "pyinstrument-4.7.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dc1ae87dc6ba8e7fad7ef70996a94a9fd63d5c5c8daa86eb9bc3b2e87f6733a"}, + {file = "pyinstrument-4.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a340ef24718228c57f49750dcac68db1f7d1c9c4d3ce004d3c154f464bacb3d1"}, + {file = "pyinstrument-4.7.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:85e441fcb06d087ae836551dee6a9a9bacf12b0a0c9a6e956376e7c779190474"}, + {file = "pyinstrument-4.7.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fa1f4c0fd2cb118fea3e6d8ba5fcaa9b51c92344841935a7c2c4a8964647273e"}, + {file = "pyinstrument-4.7.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1c8a500c7d077bba643fb3c12fc810f7e1f15fbf37d418cb751f1ee98e275ce6"}, + {file = "pyinstrument-4.7.2-cp311-cp311-win32.whl", hash = "sha256:aa8818f465ed4a6fbe6a2dd59589cc8087fd7ea5faebc32b45c1cb3eb27cfd36"}, + {file = 
"pyinstrument-4.7.2-cp311-cp311-win_amd64.whl", hash = "sha256:ef64820320ab78f0ce0992104cb7d343ffbb199c015f163fbdc2c66cb3215347"}, + {file = "pyinstrument-4.7.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:10e39476dad9751f2e88a77e50eb5466d16701d9b4efc507a3addce24d1ef43e"}, + {file = "pyinstrument-4.7.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7077831b06d9fec49a92100c8dfd237e1a4c363183746d5a9d44c0174c587547"}, + {file = "pyinstrument-4.7.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c2100cf016ee71be21d209d3003ce0dfdac8d74e5e45b9f9ae0a3cfceef7360a"}, + {file = "pyinstrument-4.7.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b00caeff2a7971752a428f9690a337a97ebbdbf14c0f05280b0a4176efd321c"}, + {file = "pyinstrument-4.7.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35dad76e54f0b94f4407579740d91d413ddbc471b465da3782ffa85a87180cbd"}, + {file = "pyinstrument-4.7.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6e6c95ff1e05661457d3f53985a23579cec9fd23639af271fd238ddd545562d4"}, + {file = "pyinstrument-4.7.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:685e998538ba2145fbfe4428534f1cabb5b5719cd5454fbc88c3ab043f2267cb"}, + {file = "pyinstrument-4.7.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0f43db19d1bb923b8b4b50f1d95994151cb04e848acd4740238e3805e87825c3"}, + {file = "pyinstrument-4.7.2-cp312-cp312-win32.whl", hash = "sha256:ef63b4157bf245a2b9543fa71cec71116a4e19c2a6a6ad96623d7b85eaa32119"}, + {file = "pyinstrument-4.7.2-cp312-cp312-win_amd64.whl", hash = "sha256:140203d90e89a06dad86b07cb8d9ab1d763ddc1332502839daac19ff6360ae84"}, + {file = "pyinstrument-4.7.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2df465b065435152473b7c4d0b80c05d3136769251fd7fe725cfcb6eb87340fa"}, + {file = "pyinstrument-4.7.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:50023b396289a27ea5d2f60d78bdeec7e4ccc6051038dfd7f5638c15a314a5d5"}, + {file = "pyinstrument-4.7.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:065451fed990ad050b0fdb4a2bd5f28426f5c5f4b94bd8dab9d144079e073761"}, + {file = "pyinstrument-4.7.2-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:017788c61627f74c3ea503198628bccc46a87e421a282dfb055ff4500026748f"}, + {file = "pyinstrument-4.7.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8df61a879c7316f31791018c92f8cca92cd4dc5a624e629c3d969d77a3657fb"}, + {file = "pyinstrument-4.7.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:656910a5fbb7b99232f8f835815cdf69734b229434c26380c29a0ef09ec9874d"}, + {file = "pyinstrument-4.7.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c2337616952ec3bd35dedb9a1ed396a3accfc0305bc54e22179e77fe63d50909"}, + {file = "pyinstrument-4.7.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ea4e4e7a8ea9a042fa2c4e0efc00d87b29e0af4a1a0b3dba907c3c63cdde4510"}, + {file = "pyinstrument-4.7.2-cp313-cp313-win32.whl", hash = "sha256:24012bc0e5a507189f5f1caa01b4589bb286348e929df6a898c926ffd6e5238a"}, + {file = "pyinstrument-4.7.2-cp313-cp313-win_amd64.whl", hash = "sha256:3d8eaf57bc447b8e108b5d684b371c64232d9895b06a097d8dc2b92f3fdde561"}, + {file = "pyinstrument-4.7.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3cfa57f2a94a52fb3a3e66e910f753b6fd954e20c12407b8e80cc8e50733f771"}, + {file = "pyinstrument-4.7.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4e9a5344b9e8a2748ba610502e7fa951d494591f8e5d8337100108f94bd73e30"}, + {file 
= "pyinstrument-4.7.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea9af525ce70e9d391b321015e3ef24cccf4df8c51c692492cade49e440b17c2"}, + {file = "pyinstrument-4.7.2-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b05d17721f99e7356e540a3be84bcad2c4f74144fe3a52d74a7da149f44d03d"}, + {file = "pyinstrument-4.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08581cb58877716d1839950ff0d474516ae743c575dff051babfb066e9c38405"}, + {file = "pyinstrument-4.7.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ad5b688488cab71b601e0aaefd726029f6ddc05525995424387fa88c6f1ce365"}, + {file = "pyinstrument-4.7.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5704125a8b8a0c0d98716d207e1882dfd90fe6c37bf6ac0055b671e43bb13b27"}, + {file = "pyinstrument-4.7.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d704ec91a774066c4a1d1f20046a00e1ef80f50ba9d024919e62365d84b55bdd"}, + {file = "pyinstrument-4.7.2-cp38-cp38-win32.whl", hash = "sha256:6969676c30ce6e078d453a232b074476e32506c5b30a44fc7847cbfe1cb8674f"}, + {file = "pyinstrument-4.7.2-cp38-cp38-win_amd64.whl", hash = "sha256:b6504d60875443bee1f8c31517832b6c054ac0389b745a897484ea1e7edeec5c"}, + {file = "pyinstrument-4.7.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a483be96c025e0287125aad85be3a0bee8687f069e422fb29eab49dd3d53a53d"}, + {file = "pyinstrument-4.7.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ac0caa72765e8f068ad92e9c24c45cf0f4e31c902f403e264199a5667a2e034"}, + {file = "pyinstrument-4.7.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8279d811e86afab5bc31e4aa4f3310b8c5b83682d52cfabee990a9f6a67cd551"}, + {file = "pyinstrument-4.7.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8d5b24a14d0fc74e6d9e471088936593cd9f55bb1bfd502e7801913e9d14308e"}, + {file = "pyinstrument-4.7.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbf90a6b86313ca01b85909e93fb5aaa7a26422a0c6347a07e249b381e77219e"}, + {file = "pyinstrument-4.7.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:e9a96dcbdb272a389fbecb28a5916fab09d2d1a515c997e7bed08c68d5835fbe"}, + {file = "pyinstrument-4.7.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:029855d9bd6bdf66b1948d697261446f049af0b576f0f4b9c2bb5a741a15fefc"}, + {file = "pyinstrument-4.7.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b0331ff6984642a0f66be9e4a66331f1a401948b8bf89ed60990f229fbd10432"}, + {file = "pyinstrument-4.7.2-cp39-cp39-win32.whl", hash = "sha256:4db19ffbb0047e00c6d444ac0e648505982399361aa609b3af9229a971dca79e"}, + {file = "pyinstrument-4.7.2-cp39-cp39-win_amd64.whl", hash = "sha256:b174abcc7438f8aa20a190fcafd8eba099af54af445ce5ea1b28b25750f59652"}, + {file = "pyinstrument-4.7.2.tar.gz", hash = "sha256:8c4e4792e7bc2de6ad757dcb05bb6739b5aed64f834602e8121f611e3278e0d1"}, +] + +[package.extras] +bin = ["click", "nox"] +docs = ["furo (==2024.7.18)", "myst-parser (==3.0.1)", "sphinx (==7.4.7)", "sphinx-autobuild (==2024.4.16)", "sphinxcontrib-programoutput (==0.17)"] +examples = ["django", "litestar", "numpy"] +test = ["cffi (>=v1.17.0rc1)", "flaky", "greenlet (>=3.0.0a1)", "ipython", "pytest", "pytest-asyncio (==0.23.8)", "trio"] +types = ["typing-extensions"] + [[package]] name = "pytest" version = "8.3.2" @@ -205,6 +615,26 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", 
"xmlschema"] +[[package]] +name = "pytest-memray" +version = "1.7.0" +description = "A simple plugin to use with pytest" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest_memray-1.7.0-py3-none-any.whl", hash = "sha256:b896718c1adf6d0cd339dfaaaa5620f035c9919e1199a79b3453804a1254306f"}, + {file = "pytest_memray-1.7.0.tar.gz", hash = "sha256:c18fa907d2210b42f4096c093e2d3416dfc002dcaa450ef3f9ba819bc3dd8f5f"}, +] + +[package.dependencies] +memray = ">=1.12" +pytest = ">=7.2" + +[package.extras] +docs = ["furo (>=2022.12.7)", "sphinx (>=6.1.3)", "sphinx-argparse (>=0.4)", "sphinx-inline-tabs (>=2022.1.2b11)", "sphinxcontrib-programoutput (>=0.17)", "towncrier (>=22.12)"] +lint = ["black (==22.12)", "isort (==5.11.4)", "mypy (==0.991)", "ruff (==0.0.272)"] +test = ["anyio (>=4.4.0)", "covdefaults (>=2.2.2)", "coverage (>=7.0.5)", "flaky (>=3.7)", "pytest (>=7.2)", "pytest-xdist (>=3.1)"] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -219,6 +649,68 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "pyyaml" +version = "6.0.2" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, + {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, + {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, + {file = 
"PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, + {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, + {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, + {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, + {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, + {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, + {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, + {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, + {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, + {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, + {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, + {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, + {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, + {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, +] + [[package]] name = "requests" version = "2.32.3" @@ -257,6 +749,25 @@ requests = ">=2.22,<3" [package.extras] fixture = ["fixtures"] +[[package]] +name = "rich" +version = "13.7.1" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"}, + {file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9\""} + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + [[package]] name = "ruff" version = "0.6.1" @@ -295,6 +806,25 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "textual" +version = "0.73.0" +description = "Modern Text User Interface framework" +optional = false +python-versions = "<4.0,>=3.8" +files = [ + {file = "textual-0.73.0-py3-none-any.whl", hash = "sha256:4d93d80d203f7fb7ba51828a546e8777019700d529a1b405ceee313dea2edfc2"}, + {file = "textual-0.73.0.tar.gz", hash = "sha256:ccd1e873370577f557dfdf2b3411f2a4f68b57d4365f9d83a00d084afb15f5a6"}, +] + 
+[package.dependencies] +markdown-it-py = {version = ">=2.1.0", extras = ["linkify", "plugins"]} +rich = ">=13.3.3" +typing-extensions = ">=4.4.0,<5.0.0" + +[package.extras] +syntax = ["tree-sitter (>=0.20.1,<0.21.0)", "tree-sitter-languages (==1.10.2)"] + [[package]] name = "tomli" version = "2.0.1" @@ -306,6 +836,47 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "typing-extensions" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] + +[[package]] +name = "uc-micro-py" +version = "1.0.3" +description = "Micro subset of unicode data files for linkify-it-py projects." +optional = false +python-versions = ">=3.7" +files = [ + {file = "uc-micro-py-1.0.3.tar.gz", hash = "sha256:d321b92cff673ec58027c04015fcaa8bb1e005478643ff4a500882eaab88c48a"}, + {file = "uc_micro_py-1.0.3-py3-none-any.whl", hash = "sha256:db1dffff340817673d7b466ec86114a9dc0e9d4d9b5ba229d9d60e5c12600cd5"}, +] + +[package.extras] +test = ["coverage", "pytest", "pytest-cov"] + +[[package]] +name = "urllib3" +version = "1.26.19" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, + {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, +] + +[package.extras] +brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + [[package]] name = "urllib3" version = "2.2.2" @@ -323,7 +894,209 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "vcrpy" +version = "6.0.1" +description = "Automatically mock your HTTP interactions to simplify and speed up testing" +optional = false +python-versions = ">=3.8" +files = [ + {file = "vcrpy-6.0.1-py2.py3-none-any.whl", hash = "sha256:621c3fb2d6bd8aa9f87532c688e4575bcbbde0c0afeb5ebdb7e14cac409edfdd"}, + {file = "vcrpy-6.0.1.tar.gz", hash = "sha256:9e023fee7f892baa0bbda2f7da7c8ac51165c1c6e38ff8688683a12a4bde9278"}, +] + +[package.dependencies] +PyYAML = "*" +urllib3 = {version = "<2", markers = "platform_python_implementation == \"PyPy\" or python_version < \"3.10\""} +wrapt = "*" +yarl = "*" + +[package.extras] +tests = ["Werkzeug (==2.0.3)", "aiohttp", "boto3", "httplib2", "httpx", "pytest", "pytest-aiohttp", "pytest-asyncio", "pytest-cov", "pytest-httpbin", "requests (>=2.22.0)", "tornado", "urllib3"] + +[[package]] +name = "wrapt" +version = "1.16.0" +description = "Module for decorators, wrappers and monkey patching." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"}, + {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"}, + {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"}, + {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"}, + {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"}, + {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"}, + {file = 
"wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, + {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, + {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, + {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"}, + {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"}, + {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"}, + {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"}, + {file = 
"wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"}, + {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = "sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"}, + {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, + {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, + {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", 
hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, + {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, + {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, +] + +[[package]] +name = "yarl" +version = "1.9.4" +description = "Yet another URL library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"}, + {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"}, + {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"}, + {file = 
"yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"}, + {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"}, + {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"}, + {file = "yarl-1.9.4-cp312-cp312-win32.whl", 
hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"}, + {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"}, + {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"}, + {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"}, + {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"}, + {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"}, + {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"}, + {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"}, + {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"}, + {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"}, + {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"}, +] + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = 
"d194c98c146afb5b1110c889dd9746762a7ed79baa7144b0ca9cab96d1995ca6" +content-hash = "3a26c9a4fba04babfcead3a954f3e6aa383db8b6f4f2e3134717340051d536a3" diff --git a/pyproject.toml b/pyproject.toml index d267a85..96a6ed7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,16 +19,26 @@ classifiers=[ 'Topic :: Text Processing :: Indexing', 'Topic :: Text Processing :: Markup :: XML', ] +packages = [ + { include = "usp" } +] [tool.poetry.dependencies] python = "^3.8" -python-dateutil = ">=2.1,<3.0.0" +python-dateutil = ">=2.7,<3.0.0" requests = ">=2.2.1" [tool.poetry.group.dev.dependencies] requests-mock = ">=1.6.0,<2.0" pytest = "^8.3.0" ruff = "^0.6.1" +vcrpy = "6.0.1" + +[tool.poetry.group.perf] +optional = true +[tool.poetry.group.perf.dependencies] +pytest-memray = "^1.7.0" +pyinstrument = "^4.7.2" [build-system] requires = ["poetry-core"] @@ -46,4 +56,8 @@ select = [ "F", "UP", "PT" -] \ No newline at end of file +] + +[tool.pytest.ini_options] +log_cli = true +log_cli_level = "WARNING" \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/integration/README.md b/tests/integration/README.md new file mode 100644 index 0000000..63375f8 --- /dev/null +++ b/tests/integration/README.md @@ -0,0 +1,51 @@ +# Integration & Performance Tests + +These tests use [VCR.py](https://vcrpy.readthedocs.io/) cassettes to avoid making real HTTP requests. Due to the size of the cassettes, they are not included in this repository. + +## Downloading Cassettes + +Cassettes are distributed from releases in a [separate repository](https://github.com/GateNLP/usp-test-cassettes). For an overview of available cassettes, see [the manifest file](https://github.com/GateNLP/usp-test-cassettes/blob/main/manifest.json). + +Run `python3 download.py` to download and decompress all available cassettes into the `cassettes` directory. + +Some cassette files are quite large when decompressed (~400MB) but compress relatively efficiently (~30MB). + +> [!IMPORTANT] +> In USP's tests, VCR.py is configured to run in `none` record mode (HTTP requests not included in the cassette will cause failure). +> This means that code changes causing new HTTP requests will temporarily break performance tests until the cassettes can be updated. + +## Running Tests + +Integration tests must be manually enabled with the `--integration` flag. + +```bash +pytest --integration tests/integration +``` + +## Memory Profiling with Memray + +To profile memory usage during tests, run the test command with the `--memray` + +```bash +pytest --memray [--memray-bin-path memray] --integration tests/integration +``` + +Without the --memray-bin-path argument, this will measure memory usage and report at the end of the test run. +With the argument, it will output the memory usage reports to the `memray` directory, which can then be used to generate reports e.g. [a flamegraph](https://bloomberg.github.io/memray/flamegraph.html). + + +## Performance Profiling with Pyinstrument + +To profile performance during tests, run through the pyinstrument CLI: + +```bash +pyinstrument -m pytest --integration tests/integration +``` + +Pyinstrument does not distinguish between tests, so you may want to filter to a specific test at a time with -k. 
For example, to only run the bbc.co.uk test: + +```bash +pyinstrument -m pytest --integration -k bbc tests/integration +``` + +This can be viewed as an interactive HTML report by passing `-r html` to `pyinstrument` initially, or using the `--load-prev` command output at the end of the test run. \ No newline at end of file diff --git a/tests/integration/cassettes/.gitignore b/tests/integration/cassettes/.gitignore new file mode 100644 index 0000000..5e7a51a --- /dev/null +++ b/tests/integration/cassettes/.gitignore @@ -0,0 +1,2 @@ +*.yaml +manifest.json \ No newline at end of file diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 0000000..5045e49 --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,20 @@ +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--integration", action="store_true", default=False, help="run integration tests" + ) + + +def pytest_configure(config): + config.addinivalue_line("markers", "integration: mark test as an integration test") + +def pytest_collection_modifyitems(config, items): + if config.getoption("--integration"): + return + else: + skip_perf = pytest.mark.skip(reason="need --integration option to run") + for item in items: + if "integration" in item.keywords: + item.add_marker(skip_perf) \ No newline at end of file diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py new file mode 100644 index 0000000..c83c39f --- /dev/null +++ b/tests/integration/test_integration.py @@ -0,0 +1,42 @@ +import json +import logging +from pathlib import Path + +import pytest +import vcr + +from usp.tree import sitemap_tree_for_homepage + + +def pytest_generate_tests(metafunc): + # cassettes = list(Path(__file__).parent.joinpath('cassettes').glob('*.yaml')) + # cassette_names = [f"integration-{cassette.stem}" for cassette in cassettes] + # metafunc.parametrize('cassette_path', cassettes, ids=cassette_names, indirect=True) + cassettes_root = Path(__file__).parent / "cassettes" + + manifest_path = cassettes_root / "manifest.json" + if not manifest_path.exists(): + return + + manifest = json.loads(manifest_path.read_text()) + cassette_fixtures = [(url, cassettes_root / item['name']) for url, item in manifest.items()] + cassette_ids = [f"integration-{url}" for url, _ in cassette_fixtures] + metafunc.parametrize('site_url,cassette_path', cassette_fixtures, ids=cassette_ids) + +@pytest.fixture +def with_vcr(cassette_path): + with vcr.use_cassette(cassette_path, record_mode='none'): + yield + +@pytest.mark.usefixtures('with_vcr') +@pytest.mark.integration +def test_integration(site_url, cassette_path): + print(f"Loading {cassette_path}") + sitemap = sitemap_tree_for_homepage(site_url) + + # Do this over converting to a list() as this will load all pages into memory + # That would always be the largest memory use so would prevent measurement of the mid-process memory use + page_count = 0 + for page in sitemap.all_pages(): + page_count += 1 + print(f"Site {site_url} has {page_count} pages") \ No newline at end of file diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index e39ab1a..f814182 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -84,7 +84,7 @@ def __init__( self._recursion_level = recursion_level def sitemap(self) -> AbstractSitemap: - log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...") + log.warning(f"Fetching level {self._recursion_level} sitemap from {self._url}...") response = get_url_retry_on_client_errors( 
url=self._url, web_client=self._web_client ) @@ -126,7 +126,7 @@ def sitemap(self) -> AbstractSitemap: web_client=self._web_client, ) - log.info(f"Parsing sitemap from URL {self._url}...") + log.warning(f"Parsing sitemap from URL {self._url}...") sitemap = parser.sitemap() return sitemap @@ -628,6 +628,7 @@ def page(self) -> Optional[SitemapPage]: __slots__ = [ "_current_page", "_pages", + "_page_urls" ] def __init__(self, url: str): @@ -635,6 +636,7 @@ def __init__(self, url: str): self._current_page = None self._pages = [] + self._page_urls = set() def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: super().xml_element_start(name=name, attrs=attrs) @@ -659,8 +661,9 @@ def xml_element_end(self, name: str) -> None: ) if name == "sitemap:url": - if self._current_page not in self._pages: + if self._current_page.url not in self._page_urls: self._pages.append(self._current_page) + self._page_urls.add(self._current_page.url) self._current_page = None else: @@ -788,6 +791,7 @@ def page(self) -> Optional[SitemapPage]: __slots__ = [ "_current_page", "_pages", + "_page_links" ] def __init__(self, url: str): @@ -795,6 +799,7 @@ def __init__(self, url: str): self._current_page = None self._pages = [] + self._page_links = set() def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: super().xml_element_start(name=name, attrs=attrs) @@ -816,8 +821,9 @@ def xml_element_end(self, name: str) -> None: # If within already if self._current_page: if name == "item": - if self._current_page not in self._pages: + if self._current_page.link not in self._page_links: self._pages.append(self._current_page) + self._page_links.add(self._current_page.link) self._current_page = None else: @@ -920,6 +926,7 @@ def page(self) -> Optional[SitemapPage]: __slots__ = [ "_current_page", "_pages", + "_page_links", "_last_link_rel_self_href", ] @@ -928,6 +935,7 @@ def __init__(self, url: str): self._current_page = None self._pages = [] + self._page_links = set() self._last_link_rel_self_href = None def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: @@ -962,8 +970,9 @@ def xml_element_end(self, name: str) -> None: self._current_page.link = self._last_link_rel_self_href self._last_link_rel_self_href = None - if self._current_page not in self._pages: + if self._current_page.link not in self._page_links: self._pages.append(self._current_page) + self._page_links.add(self._current_page.link) self._current_page = None diff --git a/usp/helpers.py b/usp/helpers.py index 4037faa..7364987 100644 --- a/usp/helpers.py +++ b/usp/helpers.py @@ -4,11 +4,13 @@ import gzip as gzip_lib import html import re +import sys import time from typing import Optional from urllib.parse import urlparse, unquote_plus, urlunparse from dateutil.parser import parse as dateutil_parse +from dateutil.parser import isoparse as dateutil_isoparse from .exceptions import SitemapException, GunzipException, StripURLToHomepageException from .log import create_logger @@ -24,6 +26,8 @@ __URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE) """Regular expression to match HTTP(s) URLs.""" +HAS_DATETIME_NEW_ISOPARSER = sys.version_info >= (3, 11) + def is_http_url(url: str) -> bool: """ @@ -94,9 +98,16 @@ def parse_iso8601_date(date_string: str) -> datetime.datetime: if not date_string: raise SitemapException("Date string is unset.") - date = dateutil_parse(date_string) + if HAS_DATETIME_NEW_ISOPARSER: + # From Python 3.11, fromisoformat is able to parse nearly any valid ISO 8601 string + return
datetime.datetime.fromisoformat(date_string) - return date + try: + # Try the more efficient ISO 8601 parser + return dateutil_isoparse(date_string) + except ValueError: + # Try the less efficient general parser + return dateutil_parse(date_string) def parse_rfc2822_date(date_string: str) -> datetime.datetime: @@ -107,7 +118,12 @@ def parse_rfc2822_date(date_string: str) -> datetime.datetime: :return: datetime.datetime object of a parsed date. """ # FIXME parse known date formats faster - return parse_iso8601_date(date_string) + if not date_string: + raise SitemapException("Date string is unset.") + + date = dateutil_parse(date_string) + + return date def get_url_retry_on_client_errors( @@ -163,8 +179,9 @@ def __response_is_gzipped_data( uri = urlparse(url) url_path = unquote_plus(uri.path) content_type = response.header("content-type") or "" + content_encoding = response.header("content-encoding") or "" - if url_path.lower().endswith(".gz") or "gzip" in content_type.lower(): + if url_path.lower().endswith(".gz") or "gzip" in content_type.lower() or "gzip" in content_encoding.lower(): return True else: From 180923b3321d357b63e2276ded61e7e881029512 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sun, 18 Aug 2024 14:26:32 +0100 Subject: [PATCH 29/79] Ruff --- tests/integration/conftest.py | 8 ++++++-- tests/integration/test_integration.py | 17 ++++++++++------- usp/fetch_parse.py | 16 ++++------------ usp/helpers.py | 6 +++++- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 5045e49..c5d5790 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -3,13 +3,17 @@ def pytest_addoption(parser): parser.addoption( - "--integration", action="store_true", default=False, help="run integration tests" + "--integration", + action="store_true", + default=False, + help="run integration tests", ) def pytest_configure(config): config.addinivalue_line("markers", "integration: mark test as an integration test") + def pytest_collection_modifyitems(config, items): if config.getoption("--integration"): return @@ -17,4 +21,4 @@ def pytest_collection_modifyitems(config, items): skip_perf = pytest.mark.skip(reason="need --integration option to run") for item in items: if "integration" in item.keywords: - item.add_marker(skip_perf) \ No newline at end of file + item.add_marker(skip_perf) diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py index c83c39f..75c5206 100644 --- a/tests/integration/test_integration.py +++ b/tests/integration/test_integration.py @@ -1,5 +1,4 @@ import json -import logging from pathlib import Path import pytest @@ -19,16 +18,20 @@ def pytest_generate_tests(metafunc): return manifest = json.loads(manifest_path.read_text()) - cassette_fixtures = [(url, cassettes_root / item['name']) for url, item in manifest.items()] + cassette_fixtures = [ + (url, cassettes_root / item["name"]) for url, item in manifest.items() + ] cassette_ids = [f"integration-{url}" for url, _ in cassette_fixtures] - metafunc.parametrize('site_url,cassette_path', cassette_fixtures, ids=cassette_ids) + metafunc.parametrize("site_url,cassette_path", cassette_fixtures, ids=cassette_ids) + @pytest.fixture -def with_vcr(cassette_path): - with vcr.use_cassette(cassette_path, record_mode='none'): +def _with_vcr(cassette_path): + with vcr.use_cassette(cassette_path, record_mode="none"): yield -@pytest.mark.usefixtures('with_vcr') + +@pytest.mark.usefixtures("_with_vcr") 
@pytest.mark.integration def test_integration(site_url, cassette_path): print(f"Loading {cassette_path}") @@ -39,4 +42,4 @@ def test_integration(site_url, cassette_path): page_count = 0 for page in sitemap.all_pages(): page_count += 1 - print(f"Site {site_url} has {page_count} pages") \ No newline at end of file + print(f"Site {site_url} has {page_count} pages") diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index f814182..c0ad92f 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -84,7 +84,7 @@ def __init__( self._recursion_level = recursion_level def sitemap(self) -> AbstractSitemap: - log.warning(f"Fetching level {self._recursion_level} sitemap from {self._url}...") + log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...") response = get_url_retry_on_client_errors( url=self._url, web_client=self._web_client ) @@ -126,7 +126,7 @@ def sitemap(self) -> AbstractSitemap: web_client=self._web_client, ) - log.warning(f"Parsing sitemap from URL {self._url}...") + log.info(f"Parsing sitemap from URL {self._url}...") sitemap = parser.sitemap() return sitemap @@ -625,11 +625,7 @@ def page(self) -> Optional[SitemapPage]: news_story=sitemap_news_story, ) - __slots__ = [ - "_current_page", - "_pages", - "_page_urls" - ] + __slots__ = ["_current_page", "_pages", "_page_urls"] def __init__(self, url: str): super().__init__(url=url) @@ -788,11 +784,7 @@ def page(self) -> Optional[SitemapPage]: ), ) - __slots__ = [ - "_current_page", - "_pages", - "_page_links" - ] + __slots__ = ["_current_page", "_pages", "_page_links"] def __init__(self, url: str): super().__init__(url=url) diff --git a/usp/helpers.py b/usp/helpers.py index 7364987..5f7e27f 100644 --- a/usp/helpers.py +++ b/usp/helpers.py @@ -181,7 +181,11 @@ def __response_is_gzipped_data( content_type = response.header("content-type") or "" content_encoding = response.header("content-encoding") or "" - if url_path.lower().endswith(".gz") or "gzip" in content_type.lower() or "gzip" in content_encoding.lower(): + if ( + url_path.lower().endswith(".gz") + or "gzip" in content_type.lower() + or "gzip" in content_encoding.lower() + ): return True else: From f1340d30dae4e5a802374eea6c69e5565bf11ecf Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 30 Aug 2024 15:29:14 +0100 Subject: [PATCH 30/79] Support using a custom XML parser --- usp/fetch_parse.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index c0ad92f..c0951a7 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -1,11 +1,18 @@ -"""Sitemap fetchers and parsers.""" +"""Sitemap fetchers and parsers. + +.. seealso:: + + :doc:`Reference of classes used for each format ` + + :doc:`Overview of parse process ` +""" import abc import re import xml.parsers.expat from collections import OrderedDict from decimal import Decimal -from typing import Optional, Dict +from typing import Any, Optional, Dict, Callable from .exceptions import SitemapException, SitemapXMLParsingException from .helpers import ( @@ -42,6 +49,12 @@ log = create_logger(__name__) +# TODO: defusedxml example +CUSTOM_XML_PARSE_CREATE: Optional[Callable[[], Any]] = None +"""Specify an alternate method to use when creating XML parsers. + +This method will be called with no arguments and must return an object with the same interface as :func:`xml.parsers.expat.ParserCreate`. 
+""" class SitemapFetcher: """robots.txt / XML / plain text sitemap fetcher.""" @@ -268,9 +281,12 @@ def __init__( self._concrete_parser = None def sitemap(self) -> AbstractSitemap: - parser = xml.parsers.expat.ParserCreate( - namespace_separator=self.__XML_NAMESPACE_SEPARATOR - ) + if CUSTOM_XML_PARSE_CREATE is not None: + parser = CUSTOM_XML_PARSE_CREATE() + else: + parser = xml.parsers.expat.ParserCreate( + namespace_separator=self.__XML_NAMESPACE_SEPARATOR + ) parser.StartElementHandler = self._xml_element_start parser.EndElementHandler = self._xml_element_end parser.CharacterDataHandler = self._xml_char_data @@ -857,9 +873,9 @@ class PagesAtomSitemapParser(AbstractXMLSitemapParser): """ Pages Atom 0.3 / 1.0 sitemap parser. - https://github.com/simplepie/simplepie-ng/wiki/Spec:-Atom-0.3 - https://www.ietf.org/rfc/rfc4287.txt - http://rakaz.nl/2005/07/moving-from-atom-03-to-10.html + - https://github.com/simplepie/simplepie-ng/wiki/Spec:-Atom-0.3 + - https://www.ietf.org/rfc/rfc4287.txt + - http://rakaz.nl/2005/07/moving-from-atom-03-to-10.html """ # FIXME merge with RSS parser class as there are too many similarities From f6726cfd87a0b3157bfbc03b989729bcffa0d815 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 30 Aug 2024 15:31:45 +0100 Subject: [PATCH 31/79] Correct fallback when parsing ISO dates with native function --- usp/helpers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/usp/helpers.py b/usp/helpers.py index 5f7e27f..fcf0b22 100644 --- a/usp/helpers.py +++ b/usp/helpers.py @@ -98,11 +98,10 @@ def parse_iso8601_date(date_string: str) -> datetime.datetime: if not date_string: raise SitemapException("Date string is unset.") - if HAS_DATETIME_NEW_ISOPARSER: - # From Python 3.11, fromisosort is able to parse nearly any valid ISO 8601 string - return datetime.datetime.fromisoformat(date_string) - try: + if HAS_DATETIME_NEW_ISOPARSER: + # From Python 3.11, fromisosort is able to parse nearly any valid ISO 8601 string + return datetime.datetime.fromisoformat(date_string) # Try the more efficient ISO 8601 parser return dateutil_isoparse(date_string) except ValueError: @@ -118,6 +117,7 @@ def parse_rfc2822_date(date_string: str) -> datetime.datetime: :return: datetime.datetime object of a parsed date. 
""" # FIXME parse known date formats faster + # TODO: fix naming of this function as it shouldn't actually be RFC2822 if not date_string: raise SitemapException("Date string is unset.") From abaeeb7ac52f5818fd3ce95d635399e3a467fb3e Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 30 Aug 2024 15:32:46 +0100 Subject: [PATCH 32/79] Add consistent tree traversal interface --- tests/test_tree.py | 8 ++++ usp/objects/sitemap.py | 96 +++++++++++++++++++++++++++++++++++------- 2 files changed, 88 insertions(+), 16 deletions(-) diff --git a/tests/test_tree.py b/tests/test_tree.py index 7168e04..3b995c2 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -390,6 +390,7 @@ def test_sitemap_tree_for_homepage(self, requests_mock): assert expected_sitemap_tree == actual_sitemap_tree, diff_str assert len(list(actual_sitemap_tree.all_pages())) == 6 + assert len(list(actual_sitemap_tree.all_sitemaps())) == 7 def test_sitemap_tree_for_homepage_gzip(self, requests_mock): """Test sitemap_tree_for_homepage() with gzipped sitemaps.""" @@ -598,6 +599,8 @@ def test_sitemap_tree_for_homepage_plain_text(self, requests_mock): assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/bar.html") in pages assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/baz.html") in pages + assert len(list(actual_sitemap_tree.all_sitemaps())) == 3 + # noinspection DuplicatedCode def test_sitemap_tree_for_homepage_rss_atom(self, requests_mock): """Test sitemap_tree_for_homepage() with RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" @@ -827,6 +830,7 @@ def test_sitemap_tree_for_homepage_rss_atom(self, requests_mock): assert expected_sitemap_tree == actual_sitemap_tree, diff_str assert len(list(actual_sitemap_tree.all_pages())) == 6 + assert len(list(actual_sitemap_tree.all_sitemaps())) == 4 def test_sitemap_tree_for_homepage_rss_atom_empty(self, requests_mock): """Test sitemap_tree_for_homepage() with empty RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" @@ -935,6 +939,7 @@ def test_sitemap_tree_for_homepage_rss_atom_empty(self, requests_mock): assert expected_sitemap_tree == actual_sitemap_tree assert len(list(actual_sitemap_tree.all_pages())) == 0 + assert len(list(actual_sitemap_tree.all_sitemaps())) == 4 def test_sitemap_tree_for_homepage_prematurely_ending_xml(self, requests_mock): """Test sitemap_tree_for_homepage() with clipped XML. 
@@ -1268,6 +1273,7 @@ def test_sitemap_tree_for_homepage_huge_sitemap(self, requests_mock): actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) assert len(list(actual_sitemap_tree.all_pages())) == page_count + assert len(list(actual_sitemap_tree.all_sitemaps())) == 2 def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self, requests_mock): """Test sitemap_tree_for_homepage() with weird (but valid) spacing.""" @@ -1315,6 +1321,7 @@ def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self, requests_mock) actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) assert len(list(actual_sitemap_tree.all_pages())) == 1 + assert len(list(actual_sitemap_tree.all_sitemaps())) == 2 def test_sitemap_tree_for_homepage_utf8_bom(self, requests_mock): """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and sitemap.""" @@ -1371,3 +1378,4 @@ def test_sitemap_tree_for_homepage_utf8_bom(self, requests_mock): actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) assert len(list(actual_sitemap_tree.all_pages())) == 1 + assert len(list(actual_sitemap_tree.all_sitemaps())) == 2 diff --git a/usp/objects/sitemap.py b/usp/objects/sitemap.py index 0cac96a..df83b41 100644 --- a/usp/objects/sitemap.py +++ b/usp/objects/sitemap.py @@ -1,4 +1,12 @@ -"""Objects that represent one of the found sitemaps.""" +"""Objects that represent one of the found sitemaps. + +.. seealso:: + + :doc:`Reference of classes used for each format ` + +.. inheritance-diagram:: AbstractSitemap InvalidSitemap AbstractIndexSitemap IndexWebsiteSitemap IndexXMLSitemap IndexRobotsTxtSitemap AbstractPagesSitemap PagesXMLSitemap PagesTextSitemap PagesRSSSitemap PagesAtomSitemap + :parts: 1 +""" import abc import os @@ -50,15 +58,47 @@ def url(self) -> str: """ return self.__url + @property @abc.abstractmethod + def pages(self) -> List[SitemapPage]: + """ + Return a list of pages found in a sitemap (if any). + + Should return an empty list if this sitemap cannot have sub-pages, to allow traversal with a consistent interface. + + :return: the list of pages, or an empty list. + """ + raise NotImplementedError("Abstract method") + + # TODO: return custom iterator with set length here? def all_pages(self) -> Iterator[SitemapPage]: """ Return iterator which yields all pages of this sitemap and linked sitemaps (if any). :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any). """ + yield from self.pages + + @property + @abc.abstractmethod + def sub_sitemaps(self) -> List["AbstractSitemap"]: + """ + Return a list of sub-sitemaps of this sitemap (if any). + + Should return an empty list if this sitemap cannot have sub-pages, to allow traversal with a consistent interface. + + :return: the list of sub-sitemaps, or an empty list. + """ raise NotImplementedError("Abstract method") + def all_sitemaps(self) -> Iterator["AbstractSitemap"]: + """ + Return iterator which yields all sub-sitemaps descended from this sitemap. + + :return: Iterator which yields all sub-sitemaps descended from this sitemap. + """ + yield from self.sub_sitemaps + class InvalidSitemap(AbstractSitemap): """Invalid sitemap, e.g. the one that can't be parsed.""" @@ -106,13 +146,23 @@ def reason(self) -> str: """ return self.__reason - def all_pages(self) -> Iterator[SitemapPage]: + @property + def pages(self) -> List[SitemapPage]: """ - Return iterator which yields all pages of this sitemap and linked sitemaps (if any). 
+ Return an empty list of pages, as invalid sitemaps have no pages. - :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any). + :return: Empty list of pages. + """ + return [] + + @property + def sub_sitemaps(self) -> List["AbstractSitemap"]: """ - yield from [] + Return an empty list of sub-sitemaps, as invalid sitemaps have no sub-sitemaps. + + :return: Empty list of sub-sitemaps. + """ + return [] class AbstractPagesSitemap(AbstractSitemap, metaclass=abc.ABCMeta): @@ -158,22 +208,22 @@ def __repr__(self): @property def pages(self) -> List[SitemapPage]: """ - Return list of pages found in a sitemap. + Load pages from disk swap file and return them. - :return: List of pages found in a sitemap. + :return: List of pages found in the sitemap. """ with open(self.__pages_temp_file_path, "rb") as tmp: pages = pickle.load(tmp) return pages - def all_pages(self) -> Iterator[SitemapPage]: + @property + def sub_sitemaps(self) -> List["AbstractSitemap"]: """ - Return iterator which yields all pages of this sitemap and linked sitemaps (if any). + Return an empty list of sub-sitemaps, as pages sitemaps have no sub-sitemaps. - :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any). + :return: Empty list of sub-sitemaps. """ - yield from self.pages - + return [] class PagesXMLSitemap(AbstractPagesSitemap): """ @@ -247,13 +297,17 @@ def __repr__(self): ) @property - def sub_sitemaps(self) -> List[AbstractSitemap]: + def sub_sitemaps(self) -> List["AbstractSitemap"]: + return self.__sub_sitemaps + + @property + def pages(self) -> List[SitemapPage]: """ - Return sub-sitemaps that are linked to from this sitemap. + Return an empty list of pages, as index sitemaps have no pages. - :return: Sub-sitemaps that are linked to from this sitemap. + :return: Empty list of pages. """ - return self.__sub_sitemaps + return [] def all_pages(self) -> Iterator[SitemapPage]: """ @@ -264,6 +318,16 @@ def all_pages(self) -> Iterator[SitemapPage]: for sub_sitemap in self.sub_sitemaps: yield from sub_sitemap.all_pages() + def all_sitemaps(self) -> Iterator["AbstractSitemap"]: + """ + Return iterator which yields all sub-sitemaps of this sitemap. + + :return: Iterator which yields all sub-sitemaps of this sitemap. + """ + for sub_sitemap in self.sub_sitemaps: + yield sub_sitemap + yield from sub_sitemap.all_sitemaps() + class IndexWebsiteSitemap(AbstractIndexSitemap): """ From 791a93b1f9bad489fcf0811745845b39d2d0e686 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 30 Aug 2024 15:34:35 +0100 Subject: [PATCH 33/79] Ruff --- usp/fetch_parse.py | 1 + usp/objects/sitemap.py | 1 + 2 files changed, 2 insertions(+) diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index c0951a7..c41405e 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -56,6 +56,7 @@ This method will be called with no arguments and must return an object with the same interface as :func:`xml.parsers.expat.ParserCreate`. """ + class SitemapFetcher: """robots.txt / XML / plain text sitemap fetcher.""" diff --git a/usp/objects/sitemap.py b/usp/objects/sitemap.py index df83b41..3a2be4d 100644 --- a/usp/objects/sitemap.py +++ b/usp/objects/sitemap.py @@ -225,6 +225,7 @@ def sub_sitemaps(self) -> List["AbstractSitemap"]: """ return [] + class PagesXMLSitemap(AbstractPagesSitemap): """ XML sitemap that contains URLs to pages. 
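A quick illustration of the traversal interface introduced in the patch above; this is a minimal sketch (the homepage URL is a placeholder) that uses only the public API shown in these diffs:

```python
from usp.tree import sitemap_tree_for_homepage

# Placeholder homepage; any site with a robots.txt Sitemap entry or a well-known sitemap path works.
tree = sitemap_tree_for_homepage("https://www.example.org/")

# Walk every sitemap in the hierarchy, nested index sitemaps included,
# via the consistent pages / sub_sitemaps interface added above.
for sitemap in tree.all_sitemaps():
    print(type(sitemap).__name__, sitemap.url)

# Iterating (rather than building a list) avoids holding every page in memory at once.
for page in tree.all_pages():
    print(page.url)
```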
From c18eb60baeb66215ecc9ad1b7f80579ee9fe3733 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sat, 31 Aug 2024 12:36:27 +0100 Subject: [PATCH 34/79] Move __version__ to usp.__init__ --- usp/__about__.py | 3 --- usp/__init__.py | 5 ++++- usp/web_client/requests_client.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) delete mode 100644 usp/__about__.py diff --git a/usp/__about__.py b/usp/__about__.py deleted file mode 100644 index 3aad052..0000000 --- a/usp/__about__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Package version.""" - -__version__ = "0.6" diff --git a/usp/__init__.py b/usp/__init__.py index 8f290bb..0752510 100644 --- a/usp/__init__.py +++ b/usp/__init__.py @@ -1 +1,4 @@ -__all__ = ["tree"] +from importlib.metadata import version +__version__ = version("ultimate-sitemap-parser") + +__all__ = ["tree", "__version__"] \ No newline at end of file diff --git a/usp/web_client/requests_client.py b/usp/web_client/requests_client.py index 35aa6d6..c0b3696 100644 --- a/usp/web_client/requests_client.py +++ b/usp/web_client/requests_client.py @@ -12,7 +12,7 @@ WebClientErrorResponse, RETRYABLE_HTTP_STATUS_CODES, ) -from usp.__about__ import __version__ +from usp import __version__ class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse): From 8c9677114ccc6f5769ce6bb2de7c0cd3709cbada Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sat, 31 Aug 2024 12:37:28 +0100 Subject: [PATCH 35/79] Add CLI and ls tool --- pyproject.toml | 3 ++ usp/cli/__init__.py | 1 + usp/cli/_ls.py | 72 +++++++++++++++++++++++++++++++++++++++++++++ usp/cli/_util.py | 20 +++++++++++++ usp/cli/cli.py | 24 +++++++++++++++ usp/tree.py | 60 ++++++++++++++++++++----------------- 6 files changed, 153 insertions(+), 27 deletions(-) create mode 100644 usp/cli/__init__.py create mode 100644 usp/cli/_ls.py create mode 100644 usp/cli/_util.py create mode 100644 usp/cli/cli.py diff --git a/pyproject.toml b/pyproject.toml index 96a6ed7..5d94298 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,9 @@ packages = [ { include = "usp" } ] +[tool.poetry.scripts] +usp = 'usp.cli:main' + [tool.poetry.dependencies] python = "^3.8" python-dateutil = ">=2.7,<3.0.0" diff --git a/usp/cli/__init__.py b/usp/cli/__init__.py new file mode 100644 index 0000000..cb2c40b --- /dev/null +++ b/usp/cli/__init__.py @@ -0,0 +1 @@ +from usp.cli.cli import main as main \ No newline at end of file diff --git a/usp/cli/_ls.py b/usp/cli/_ls.py new file mode 100644 index 0000000..d9acfa7 --- /dev/null +++ b/usp/cli/_ls.py @@ -0,0 +1,72 @@ +import argparse +import sys +from typing import Iterator + +from usp.cli._util import tabs, format_help +from usp.objects.sitemap import AbstractSitemap +from usp.tree import sitemap_tree_for_homepage + +LS_FORMATS = { + "tabtree": "Sitemaps and pages, nested with tab indentation", + "pages": "Flat list of pages, one per line" +} + + +def register(subparsers): + ls_parser = subparsers.add_parser('ls', help="List sitemap pages", description="download, parse and list the sitemap structure", formatter_class=argparse.RawTextHelpFormatter) + ls_parser.add_argument("url", type=str, help="URL of the site including protocol") + ls_parser.add_argument("-f", "--format", choices=LS_FORMATS, default="tabtree", help=format_help(LS_FORMATS, "set output format"), metavar='') + ls_parser.add_argument("-r", "--no-robots", action="store_true", help="don't discover sitemaps through robots.txt") + ls_parser.add_argument("-k", "--no-known", action="store_true", help="don't discover sitemaps through well-known 
URLs") + ls_parser.add_argument("-u", "--keep-url", action="store_true", help="don't strip the supplied URL from each page and sitemap URL") + ls_parser.set_defaults(page_only=False, no_robots=False, no_known=False, keep_url=False) + + ls_parser.set_defaults(func=ls) + +def _strip_url(url: str, prefix: str): + url = url.removeprefix(prefix) + + if not url.startswith('/') and prefix != "": + return '/' + url + return url + +def _list_page_urls(sitemap: AbstractSitemap, prefix: str = "") -> Iterator[str]: + for page in sitemap.all_pages(): + yield prefix + page.url + + +def _output_sitemap_nested(sitemap: AbstractSitemap, strip_prefix: str = "", depth: int = 0): + sitemap_url = sitemap.url + if depth != 0: + sitemap_url = _strip_url(sitemap_url, strip_prefix) + sys.stdout.write(tabs(depth) + sitemap_url + "\n") + + for sub_map in sitemap.sub_sitemaps: + _output_sitemap_nested(sub_map, strip_prefix, depth + 1) + + for page in sitemap.pages: + sys.stdout.write(tabs(depth + 1) + _strip_url(page.url, strip_prefix) + "\n") + +def _output_pages(sitemap: AbstractSitemap, strip_prefix: str = ""): + for page in sitemap.all_pages(): + sys.stdout.write(_strip_url(page.url, strip_prefix) + "\n") + +def ls(args): + tree = sitemap_tree_for_homepage( + args.url, + use_robots=not args.no_robots, + use_known_paths=not args.no_known, + ) + + strip_prefix = "" + if not args.keep_url: + strip_prefix = tree.url + + if args.format == "pages": + _output_pages(tree, strip_prefix) + elif args.format == "tabtree": + _output_sitemap_nested(tree, strip_prefix) + else: + raise NotImplementedError(f"Format '{args.format}' not implemented") + + exit(0) diff --git a/usp/cli/_util.py b/usp/cli/_util.py new file mode 100644 index 0000000..20c8cda --- /dev/null +++ b/usp/cli/_util.py @@ -0,0 +1,20 @@ +from typing import Dict + +def format_help(choices: Dict[str, str], opt_help: str) -> str: + """Generate help text for argparse choices. + + :param choices: Dictionary of choices {choice: help} + :param opt_help: Help text for the option: + :return: Help text for argparse choices. + """ + h = f"{opt_help} (default: %(default)s)\nchoices:\n" + + for fmt, key in choices.items(): + h += f" {fmt}: {key}\n" + + return h + + +def tabs(n: int): + """Generate n tabs.""" + return "\t" * n diff --git a/usp/cli/cli.py b/usp/cli/cli.py new file mode 100644 index 0000000..7193b71 --- /dev/null +++ b/usp/cli/cli.py @@ -0,0 +1,24 @@ +from argparse import ArgumentParser + +from usp.cli import _ls as ls_cmd +from usp import __version__ + +def main(): + parser = ArgumentParser(prog="usp", description="Ultimate Sitemap Parser") + parser.add_argument("--version", "-v", action="version", version=f"%(prog)s v{__version__}") + + subparsers = parser.add_subparsers(required=False, title="commands", metavar='') + ls_cmd.register(subparsers) + + args = parser.parse_args() + + if "func" in args: + args.func(args) + else: + parser.print_help() + + exit(0) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/usp/tree.py b/usp/tree.py index 5759355..3a7ad23 100644 --- a/usp/tree.py +++ b/usp/tree.py @@ -36,13 +36,17 @@ def sitemap_tree_for_homepage( - homepage_url: str, web_client: Optional[AbstractWebClient] = None + homepage_url: str, web_client: Optional[AbstractWebClient] = None, + use_robots: bool = True, + use_known_paths: bool = True ) -> AbstractSitemap: """ Using a homepage URL, fetch the tree of sitemaps and pages listed in them. :param homepage_url: Homepage URL of a website to fetch the sitemap tree for, e.g. 
"http://www.example.com/". :param web_client: Web client implementation to use for fetching sitemaps. + :param use_robots: Whether to discover sitemaps through robots.txt. + :param use_known_paths: Whether to discover sitemaps through common known paths. :return: Root sitemap object of the fetched sitemap tree. """ @@ -62,33 +66,35 @@ def sitemap_tree_for_homepage( sitemaps = [] - robots_txt_fetcher = SitemapFetcher( - url=robots_txt_url, web_client=web_client, recursion_level=0 - ) - robots_txt_sitemap = robots_txt_fetcher.sitemap() - if not isinstance(robots_txt_sitemap, InvalidSitemap): - sitemaps.append(robots_txt_sitemap) - sitemap_urls_found_in_robots_txt = set() - if isinstance(robots_txt_sitemap, IndexRobotsTxtSitemap): - for sub_sitemap in robots_txt_sitemap.sub_sitemaps: - sitemap_urls_found_in_robots_txt.add(sub_sitemap.url) - - for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS: - unpublished_sitemap_url = homepage_url + unpublished_sitemap_path - - # Don't refetch URLs already found in robots.txt - if unpublished_sitemap_url not in sitemap_urls_found_in_robots_txt: - unpublished_sitemap_fetcher = SitemapFetcher( - url=unpublished_sitemap_url, - web_client=web_client, - recursion_level=0, - ) - unpublished_sitemap = unpublished_sitemap_fetcher.sitemap() - - # Skip the ones that weren't found - if not isinstance(unpublished_sitemap, InvalidSitemap): - sitemaps.append(unpublished_sitemap) + if use_robots: + robots_txt_fetcher = SitemapFetcher( + url=robots_txt_url, web_client=web_client, recursion_level=0 + ) + robots_txt_sitemap = robots_txt_fetcher.sitemap() + if not isinstance(robots_txt_sitemap, InvalidSitemap): + sitemaps.append(robots_txt_sitemap) + + if isinstance(robots_txt_sitemap, IndexRobotsTxtSitemap): + for sub_sitemap in robots_txt_sitemap.all_sitemaps(): + sitemap_urls_found_in_robots_txt.add(sub_sitemap.url) + + if use_known_paths: + for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS: + unpublished_sitemap_url = homepage_url + unpublished_sitemap_path + + # Don't refetch URLs already found in robots.txt + if unpublished_sitemap_url not in sitemap_urls_found_in_robots_txt: + unpublished_sitemap_fetcher = SitemapFetcher( + url=unpublished_sitemap_url, + web_client=web_client, + recursion_level=0, + ) + unpublished_sitemap = unpublished_sitemap_fetcher.sitemap() + + # Skip the ones that weren't found + if not isinstance(unpublished_sitemap, InvalidSitemap): + sitemaps.append(unpublished_sitemap) index_sitemap = IndexWebsiteSitemap(url=homepage_url, sub_sitemaps=sitemaps) From 2df66f10550cb9d894c414a1aee985f4ef4e1567 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sat, 31 Aug 2024 12:38:22 +0100 Subject: [PATCH 36/79] Ruff --- usp/__init__.py | 3 ++- usp/cli/__init__.py | 2 +- usp/cli/_ls.py | 55 ++++++++++++++++++++++++++++++++++++--------- usp/cli/_util.py | 1 + usp/cli/cli.py | 9 +++++--- usp/tree.py | 7 +++--- 6 files changed, 59 insertions(+), 18 deletions(-) diff --git a/usp/__init__.py b/usp/__init__.py index 0752510..b766aa3 100644 --- a/usp/__init__.py +++ b/usp/__init__.py @@ -1,4 +1,5 @@ from importlib.metadata import version + __version__ = version("ultimate-sitemap-parser") -__all__ = ["tree", "__version__"] \ No newline at end of file +__all__ = ["tree", "__version__"] diff --git a/usp/cli/__init__.py b/usp/cli/__init__.py index cb2c40b..004b3c3 100644 --- a/usp/cli/__init__.py +++ b/usp/cli/__init__.py @@ -1 +1 @@ -from usp.cli.cli import main as main \ No newline at end of file +from usp.cli.cli import main as main diff --git 
a/usp/cli/_ls.py b/usp/cli/_ls.py index d9acfa7..6be8ebe 100644 --- a/usp/cli/_ls.py +++ b/usp/cli/_ls.py @@ -8,34 +8,67 @@ LS_FORMATS = { "tabtree": "Sitemaps and pages, nested with tab indentation", - "pages": "Flat list of pages, one per line" + "pages": "Flat list of pages, one per line", } def register(subparsers): - ls_parser = subparsers.add_parser('ls', help="List sitemap pages", description="download, parse and list the sitemap structure", formatter_class=argparse.RawTextHelpFormatter) + ls_parser = subparsers.add_parser( + "ls", + help="List sitemap pages", + description="download, parse and list the sitemap structure", + formatter_class=argparse.RawTextHelpFormatter, + ) ls_parser.add_argument("url", type=str, help="URL of the site including protocol") - ls_parser.add_argument("-f", "--format", choices=LS_FORMATS, default="tabtree", help=format_help(LS_FORMATS, "set output format"), metavar='') - ls_parser.add_argument("-r", "--no-robots", action="store_true", help="don't discover sitemaps through robots.txt") - ls_parser.add_argument("-k", "--no-known", action="store_true", help="don't discover sitemaps through well-known URLs") - ls_parser.add_argument("-u", "--keep-url", action="store_true", help="don't strip the supplied URL from each page and sitemap URL") - ls_parser.set_defaults(page_only=False, no_robots=False, no_known=False, keep_url=False) + ls_parser.add_argument( + "-f", + "--format", + choices=LS_FORMATS, + default="tabtree", + help=format_help(LS_FORMATS, "set output format"), + metavar="", + ) + ls_parser.add_argument( + "-r", + "--no-robots", + action="store_true", + help="don't discover sitemaps through robots.txt", + ) + ls_parser.add_argument( + "-k", + "--no-known", + action="store_true", + help="don't discover sitemaps through well-known URLs", + ) + ls_parser.add_argument( + "-u", + "--keep-url", + action="store_true", + help="don't strip the supplied URL from each page and sitemap URL", + ) + ls_parser.set_defaults( + page_only=False, no_robots=False, no_known=False, keep_url=False + ) ls_parser.set_defaults(func=ls) + def _strip_url(url: str, prefix: str): url = url.removeprefix(prefix) - if not url.startswith('/') and prefix != "": - return '/' + url + if not url.startswith("/") and prefix != "": + return "/" + url return url + def _list_page_urls(sitemap: AbstractSitemap, prefix: str = "") -> Iterator[str]: for page in sitemap.all_pages(): yield prefix + page.url -def _output_sitemap_nested(sitemap: AbstractSitemap, strip_prefix: str = "", depth: int = 0): +def _output_sitemap_nested( + sitemap: AbstractSitemap, strip_prefix: str = "", depth: int = 0 +): sitemap_url = sitemap.url if depth != 0: sitemap_url = _strip_url(sitemap_url, strip_prefix) @@ -47,10 +80,12 @@ def _output_sitemap_nested(sitemap: AbstractSitemap, strip_prefix: str = "", dep for page in sitemap.pages: sys.stdout.write(tabs(depth + 1) + _strip_url(page.url, strip_prefix) + "\n") + def _output_pages(sitemap: AbstractSitemap, strip_prefix: str = ""): for page in sitemap.all_pages(): sys.stdout.write(_strip_url(page.url, strip_prefix) + "\n") + def ls(args): tree = sitemap_tree_for_homepage( args.url, diff --git a/usp/cli/_util.py b/usp/cli/_util.py index 20c8cda..88fd156 100644 --- a/usp/cli/_util.py +++ b/usp/cli/_util.py @@ -1,5 +1,6 @@ from typing import Dict + def format_help(choices: Dict[str, str], opt_help: str) -> str: """Generate help text for argparse choices. 
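For orientation, a sketch of how the `usp` console script registered in pyproject.toml is invoked once the package is installed (example.org is a placeholder; the flags are the ones defined in `_ls.py` above):

```bash
# Default "tabtree" format: sitemaps and pages nested with tab indentation
usp ls https://www.example.org

# Flat list of page URLs, skipping discovery via the well-known sitemap paths
usp ls --format pages --no-known https://www.example.org

# Print the installed package version
usp --version
```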
diff --git a/usp/cli/cli.py b/usp/cli/cli.py index 7193b71..fa0e17e 100644 --- a/usp/cli/cli.py +++ b/usp/cli/cli.py @@ -3,11 +3,14 @@ from usp.cli import _ls as ls_cmd from usp import __version__ + def main(): parser = ArgumentParser(prog="usp", description="Ultimate Sitemap Parser") - parser.add_argument("--version", "-v", action="version", version=f"%(prog)s v{__version__}") + parser.add_argument( + "--version", "-v", action="version", version=f"%(prog)s v{__version__}" + ) - subparsers = parser.add_subparsers(required=False, title="commands", metavar='') + subparsers = parser.add_subparsers(required=False, title="commands", metavar="") ls_cmd.register(subparsers) args = parser.parse_args() @@ -21,4 +24,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/usp/tree.py b/usp/tree.py index 3a7ad23..92f39ce 100644 --- a/usp/tree.py +++ b/usp/tree.py @@ -36,9 +36,10 @@ def sitemap_tree_for_homepage( - homepage_url: str, web_client: Optional[AbstractWebClient] = None, - use_robots: bool = True, - use_known_paths: bool = True + homepage_url: str, + web_client: Optional[AbstractWebClient] = None, + use_robots: bool = True, + use_known_paths: bool = True, ) -> AbstractSitemap: """ Using a homepage URL, fetch the tree of sitemaps and pages listed in them. From 80c2e3c83c3e4c952d7d04374a6352fd94161421 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sat, 31 Aug 2024 12:39:58 +0100 Subject: [PATCH 37/79] Fix cli version arg order --- usp/cli/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usp/cli/cli.py b/usp/cli/cli.py index fa0e17e..f727ccf 100644 --- a/usp/cli/cli.py +++ b/usp/cli/cli.py @@ -7,7 +7,7 @@ def main(): parser = ArgumentParser(prog="usp", description="Ultimate Sitemap Parser") parser.add_argument( - "--version", "-v", action="version", version=f"%(prog)s v{__version__}" + "-v", "--version", action="version", version=f"%(prog)s v{__version__}" ) subparsers = parser.add_subparsers(required=False, title="commands", metavar="") From 66ff5d981350d4855343a36d76becb5646f02315 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sat, 31 Aug 2024 13:23:40 +0100 Subject: [PATCH 38/79] Remove custom XML parser as defusedexpat doesn't actually exist any more --- usp/fetch_parse.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index c41405e..eadc26d 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -49,13 +49,6 @@ log = create_logger(__name__) -# TODO: defusedxml example -CUSTOM_XML_PARSE_CREATE: Optional[Callable[[], Any]] = None -"""Specify an alternate method to use when creating XML parsers. - -This method will be called with no arguments and must return an object with the same interface as :func:`xml.parsers.expat.ParserCreate`. 
-""" - class SitemapFetcher: """robots.txt / XML / plain text sitemap fetcher.""" @@ -282,12 +275,9 @@ def __init__( self._concrete_parser = None def sitemap(self) -> AbstractSitemap: - if CUSTOM_XML_PARSE_CREATE is not None: - parser = CUSTOM_XML_PARSE_CREATE() - else: - parser = xml.parsers.expat.ParserCreate( - namespace_separator=self.__XML_NAMESPACE_SEPARATOR - ) + parser = xml.parsers.expat.ParserCreate( + namespace_separator=self.__XML_NAMESPACE_SEPARATOR + ) parser.StartElementHandler = self._xml_element_start parser.EndElementHandler = self._xml_element_end parser.CharacterDataHandler = self._xml_char_data From 85c431c7b961181ba174b660938fabd586a6fdf9 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sat, 31 Aug 2024 13:24:04 +0100 Subject: [PATCH 39/79] Make ls url stripping the default --- usp/cli/_ls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/usp/cli/_ls.py b/usp/cli/_ls.py index 6be8ebe..770b688 100644 --- a/usp/cli/_ls.py +++ b/usp/cli/_ls.py @@ -42,12 +42,12 @@ def register(subparsers): ) ls_parser.add_argument( "-u", - "--keep-url", + "--strip-url", action="store_true", - help="don't strip the supplied URL from each page and sitemap URL", + help="strip the supplied URL from each page and sitemap URL", ) ls_parser.set_defaults( - page_only=False, no_robots=False, no_known=False, keep_url=False + no_robots=False, no_known=False, strip_url=False ) ls_parser.set_defaults(func=ls) @@ -94,7 +94,7 @@ def ls(args): ) strip_prefix = "" - if not args.keep_url: + if args.strip_url: strip_prefix = tree.url if args.format == "pages": From 6385ddb50eebf1ac34938914fe9215a26af3cfc0 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sat, 31 Aug 2024 15:15:21 +0100 Subject: [PATCH 40/79] Count recursion depth for robots.txt sitemaps --- tests/test_tree.py | 59 +++++++++++++++++++++++++++++++++++++++++++++- usp/cli/_ls.py | 4 +--- usp/fetch_parse.py | 22 ++++++++++------- 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/tests/test_tree.py b/tests/test_tree.py index 3b995c2..a844dca 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -27,7 +27,6 @@ # FIXME various exotic properties # FIXME XML vulnerabilities with Expat -# FIXME max. 
recursion level # FIXME tests responses that are too big @@ -1379,3 +1378,61 @@ def test_sitemap_tree_for_homepage_utf8_bom(self, requests_mock): actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) assert len(list(actual_sitemap_tree.all_pages())) == 1 assert len(list(actual_sitemap_tree.all_sitemaps())) == 2 + + def test_max_recursion_level_xml(self, requests_mock): + robots_txt_body = textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap.xml + """ + ).strip() + + sitemap_index_body = textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/sitemap.xml + 2024-01-01 + + + """ + ).strip() + + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=robots_txt_body, + ) + requests_mock.get( + self.TEST_BASE_URL + "/sitemap.xml", + headers={"Content-Type": "application/xml"}, + text=sitemap_index_body, + ) + + tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) + sitemaps = list(tree.all_sitemaps()) + + assert type(sitemaps[-1]) is InvalidSitemap + + def test_max_recursion_level_robots(self, requests_mock): + requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) + robots_txt_body = textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/robots.txt + """ + ).strip() + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=robots_txt_body, + ) + tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) + sitemaps = list(tree.all_sitemaps()) + assert type(sitemaps[-1]) is InvalidSitemap diff --git a/usp/cli/_ls.py b/usp/cli/_ls.py index 770b688..83c214c 100644 --- a/usp/cli/_ls.py +++ b/usp/cli/_ls.py @@ -46,9 +46,7 @@ def register(subparsers): action="store_true", help="strip the supplied URL from each page and sitemap URL", ) - ls_parser.set_defaults( - no_robots=False, no_known=False, strip_url=False - ) + ls_parser.set_defaults(no_robots=False, no_known=False, strip_url=False) ls_parser.set_defaults(func=ls) diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index eadc26d..a0fdeea 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -12,7 +12,7 @@ import xml.parsers.expat from collections import OrderedDict from decimal import Decimal -from typing import Any, Optional, Dict, Callable +from typing import Optional, Dict from .exceptions import SitemapException, SitemapXMLParsingException from .helpers import ( @@ -58,7 +58,7 @@ class SitemapFetcher: Spec says it might be up to 50 MB but let's go for the full 100 MB here.""" - __MAX_RECURSION_LEVEL = 10 + __MAX_RECURSION_LEVEL = 11 """Max. 
recursion level in iterating over sub-sitemaps.""" __slots__ = [ @@ -210,12 +210,18 @@ def sitemap(self) -> AbstractSitemap: sub_sitemaps = [] for sitemap_url in sitemap_urls.keys(): - fetcher = SitemapFetcher( - url=sitemap_url, - recursion_level=self._recursion_level, - web_client=self._web_client, - ) - fetched_sitemap = fetcher.sitemap() + try: + fetcher = SitemapFetcher( + url=sitemap_url, + recursion_level=self._recursion_level + 1, + web_client=self._web_client, + ) + fetched_sitemap = fetcher.sitemap() + except Exception as ex: + fetched_sitemap = InvalidSitemap( + url=sitemap_url, + reason=f"Unable to add sub-sitemap from URL {sitemap_url}: {str(ex)}", + ) sub_sitemaps.append(fetched_sitemap) index_sitemap = IndexRobotsTxtSitemap(url=self._url, sub_sitemaps=sub_sitemaps) From c9e83b0b4af9c6a8f828f9322f4ce845db9ea33e Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sat, 31 Aug 2024 16:34:48 +0100 Subject: [PATCH 41/79] Split tree tests --- tests/test_tree.py | 1438 ---------------------- tests/tree/__init__.py | 0 tests/tree/base.py | 40 + tests/tree/test_basic.py | 547 ++++++++ tests/tree/test_edges.py | 138 +++ tests/tree/test_plain_text.py | 104 ++ tests/tree/test_robots.py | 135 ++ tests/tree/test_rss_atom.py | 363 ++++++ tests/tree/test_xml.py | 228 ++++ tests/web_client/test_requests_client.py | 2 +- 10 files changed, 1556 insertions(+), 1439 deletions(-) delete mode 100644 tests/test_tree.py create mode 100644 tests/tree/__init__.py create mode 100644 tests/tree/base.py create mode 100644 tests/tree/test_basic.py create mode 100644 tests/tree/test_edges.py create mode 100644 tests/tree/test_plain_text.py create mode 100644 tests/tree/test_robots.py create mode 100644 tests/tree/test_rss_atom.py create mode 100644 tests/tree/test_xml.py diff --git a/tests/test_tree.py b/tests/test_tree.py deleted file mode 100644 index a844dca..0000000 --- a/tests/test_tree.py +++ /dev/null @@ -1,1438 +0,0 @@ -import datetime -import difflib -import textwrap -from decimal import Decimal -from email.utils import format_datetime -import requests_mock as rq_mock -from dateutil.tz import tzoffset - -from tests.helpers import gzip -from usp.log import create_logger -from usp.objects.page import ( - SitemapPage, - SitemapNewsStory, - SitemapPageChangeFrequency, -) -from usp.objects.sitemap import ( - IndexRobotsTxtSitemap, - PagesXMLSitemap, - IndexXMLSitemap, - InvalidSitemap, - PagesTextSitemap, - IndexWebsiteSitemap, - PagesRSSSitemap, - PagesAtomSitemap, -) -from usp.tree import sitemap_tree_for_homepage - -# FIXME various exotic properties -# FIXME XML vulnerabilities with Expat -# FIXME tests responses that are too big - - -log = create_logger(__name__) - - -class TestSitemapTree: - TEST_BASE_URL = "http://test_ultimate-sitemap-parser.com" # mocked by HTTPretty - - # Publication / "last modified" date - TEST_DATE_DATETIME = datetime.datetime( - year=2009, - month=12, - day=17, - hour=12, - minute=4, - second=56, - tzinfo=tzoffset(None, 7200), - ) - TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat() - """Test string date formatted as ISO 8601 (for XML and Atom 0.3 / 1.0 sitemaps).""" - - TEST_DATE_STR_RFC2822 = format_datetime(TEST_DATE_DATETIME) - """Test string date formatted as RFC 2822 (for RSS 2.0 sitemaps).""" - - TEST_PUBLICATION_NAME = "Test publication" - TEST_PUBLICATION_LANGUAGE = "en" - - @staticmethod - def fallback_to_404_not_found_matcher(request): - """Reply with "404 Not Found" to unmatched URLs instead of throwing NoMockAddress.""" - return rq_mock.create_response( - 
request, - status_code=404, - reason="Not Found", - headers={"Content-Type": "text/html"}, - text="

<h1>404 Not Found!</h1>
", - ) - - # noinspection DuplicatedCode - def test_sitemap_tree_for_homepage(self, requests_mock): - """Test sitemap_tree_for_homepage().""" - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap_pages.xml - - # Intentionally spelled as "Site-map" as Google tolerates this: - # https://github.com/google/robotstxt/blob/master/robots.cc#L703 - Site-map: {self.TEST_BASE_URL}/sitemap_news_index_1.xml - """ - ).strip(), - ) - - # One sitemap for random static pages - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_pages.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/about.html - {self.TEST_DATE_STR_ISO8601} - monthly - 0.8 - - - {self.TEST_BASE_URL}/contact.html - {self.TEST_DATE_STR_ISO8601} - - - when we feel like it - - - 1.1 - - - - """ - ).strip(), - ) - - # Index sitemap pointing to sitemaps with stories - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_news_index_1.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/sitemap_news_1.xml - {self.TEST_DATE_STR_ISO8601} - - - {self.TEST_BASE_URL}/sitemap_news_index_2.xml - {self.TEST_DATE_STR_ISO8601} - - - """ - ).strip(), - ) - - # First sitemap with actual stories - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_news_1.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - - - - - {self.TEST_BASE_URL}/news/foo.html - - - - - - - - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - Foo <foo> - - - - - - {self.TEST_BASE_URL}/news/bar.html - - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - Bar & bar - - - - - """ - ).strip(), - ) - - # Another index sitemap pointing to a second sitemaps with stories - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_news_index_2.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - - - - - - {self.TEST_BASE_URL}/sitemap_news_2.xml - {self.TEST_DATE_STR_ISO8601} - - - - - {self.TEST_BASE_URL}/sitemap_news_missing.xml - {self.TEST_DATE_STR_ISO8601} - - - - """ - ).strip(), - ) - - # Second sitemap with actual stories - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_news_2.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - - - - - - - {self.TEST_BASE_URL}/news/bar.html - - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - - - - Bar & bar - - - - - {self.TEST_BASE_URL}/news/baz.html - - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - - - - - - """ - ).strip(), - ) - - # Nonexistent sitemap - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_news_missing.xml", - status_code=404, - reason="Not Found", - headers={"Content-Type": "text/html"}, - text="

<h1>404 Not Found!</h1>
", - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[ - IndexRobotsTxtSitemap( - url=f"{self.TEST_BASE_URL}/robots.txt", - sub_sitemaps=[ - PagesXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_pages.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/about.html", - last_modified=self.TEST_DATE_DATETIME, - news_story=None, - change_frequency=SitemapPageChangeFrequency.MONTHLY, - priority=Decimal("0.8"), - ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/contact.html", - last_modified=self.TEST_DATE_DATETIME, - news_story=None, - # Invalid input -- should be reset to "always" - change_frequency=SitemapPageChangeFrequency.ALWAYS, - # Invalid input -- should be reset to 0.5 (the default as per the spec) - priority=Decimal("0.5"), - ), - ], - ), - IndexXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_news_index_1.xml", - sub_sitemaps=[ - PagesXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_news_1.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/foo.html", - news_story=SitemapNewsStory( - title="Foo ", - publish_date=self.TEST_DATE_DATETIME, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - ), - ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/bar.html", - news_story=SitemapNewsStory( - title="Bar & bar", - publish_date=self.TEST_DATE_DATETIME, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - ), - ), - ], - ), - IndexXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_news_index_2.xml", - sub_sitemaps=[ - PagesXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_news_2.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/bar.html", - news_story=SitemapNewsStory( - title="Bar & bar", - publish_date=self.TEST_DATE_DATETIME, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - ), - ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/baz.html", - news_story=SitemapNewsStory( - title="Bąž", - publish_date=self.TEST_DATE_DATETIME, - publication_name=self.TEST_PUBLICATION_NAME, - publication_language=self.TEST_PUBLICATION_LANGUAGE, - ), - ), - ], - ), - InvalidSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_news_missing.xml", - reason=( - f"Unable to fetch sitemap from {self.TEST_BASE_URL}/sitemap_news_missing.xml: " - "404 Not Found" - ), - ), - ], - ), - ], - ), - ], - ) - ], - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - - expected_lines = str(expected_sitemap_tree).split() - actual_lines = str(actual_sitemap_tree).split() - diff = difflib.ndiff(expected_lines, actual_lines) - diff_str = "\n".join(diff) - - assert expected_sitemap_tree == actual_sitemap_tree, diff_str - - assert len(list(actual_sitemap_tree.all_pages())) == 6 - assert len(list(actual_sitemap_tree.all_sitemaps())) == 7 - - def test_sitemap_tree_for_homepage_gzip(self, requests_mock): - """Test sitemap_tree_for_homepage() with gzipped sitemaps.""" - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap_1.gz - Sitemap: {self.TEST_BASE_URL}/sitemap_2.dat - Sitemap: {self.TEST_BASE_URL}/sitemap_3.xml.gz - """ - ).strip(), - ) - - # Gzipped 
sitemap without correct HTTP header but with .gz extension - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_1.gz", - content=gzip( - textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/news/foo.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - Foo <foo> - - - - """ - ).strip() - ), - ) - - # Gzipped sitemap with correct HTTP header but without .gz extension - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_2.dat", - headers={"Content-Type": "application/x-gzip"}, - content=gzip( - textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/news/bar.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - - - - - """ - ).strip() - ), - ) - - # Sitemap which appears to be gzipped (due to extension and Content-Type) but really isn't - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_3.xml.gz", - headers={"Content-Type": "application/x-gzip"}, - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/news/baz.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - - - - - """ - ).strip(), - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - - # Don't do an in-depth check, we just need to make sure that gunzip works - assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) - assert len(actual_sitemap_tree.sub_sitemaps) == 1 - - assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap) - # noinspection PyUnresolvedReferences - assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 3 - - # noinspection PyUnresolvedReferences - sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0] - assert isinstance(sitemap_1, PagesXMLSitemap) - assert len(sitemap_1.pages) == 1 - - # noinspection PyUnresolvedReferences - sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1] - assert isinstance(sitemap_2, PagesXMLSitemap) - assert len(sitemap_2.pages) == 1 - - # noinspection PyUnresolvedReferences - sitemap_3 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[2] - assert isinstance(sitemap_3, PagesXMLSitemap) - assert len(sitemap_3.pages) == 1 - - def test_sitemap_tree_for_homepage_plain_text(self, requests_mock): - """Test sitemap_tree_for_homepage() with plain text sitemaps.""" - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap_1.txt - Sitemap: {self.TEST_BASE_URL}/sitemap_2.txt.dat - """ - ).strip(), - ) - - # Plain text uncompressed sitemap (no Content-Type header) - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_1.txt", - text=textwrap.dedent( - f""" - - {self.TEST_BASE_URL}/news/foo.html - - - {self.TEST_BASE_URL}/news/bar.html - - Some other stuff which totally doesn't look like an URL - """ - ).strip(), - ) - - # Plain text compressed sitemap without .gz extension - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_2.txt.dat", - headers={"Content-Type": "application/x-gzip"}, - content=gzip( - textwrap.dedent( - f""" - {self.TEST_BASE_URL}/news/bar.html - {self.TEST_BASE_URL}/news/baz.html - """ - ).strip() - ), - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - - assert 
isinstance(actual_sitemap_tree, IndexWebsiteSitemap) - assert len(actual_sitemap_tree.sub_sitemaps) == 1 - - assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap) - # noinspection PyUnresolvedReferences - assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2 - - # noinspection PyUnresolvedReferences - sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0] - assert isinstance(sitemap_1, PagesTextSitemap) - assert len(sitemap_1.pages) == 2 - - # noinspection PyUnresolvedReferences - sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1] - assert isinstance(sitemap_2, PagesTextSitemap) - assert len(sitemap_2.pages) == 2 - - pages = list(actual_sitemap_tree.all_pages()) - assert len(pages) == 4 - assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/foo.html") in pages - assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/bar.html") in pages - assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/baz.html") in pages - - assert len(list(actual_sitemap_tree.all_sitemaps())) == 3 - - # noinspection DuplicatedCode - def test_sitemap_tree_for_homepage_rss_atom(self, requests_mock): - """Test sitemap_tree_for_homepage() with RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap_rss.xml - Sitemap: {self.TEST_BASE_URL}/sitemap_atom_0_3.xml - Sitemap: {self.TEST_BASE_URL}/sitemap_atom_1_0.xml - """ - ).strip(), - ) - - # RSS 2.0 sitemap - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_rss.xml", - headers={"Content-Type": "application/rss+xml"}, - text=textwrap.dedent( - f""" - - - - Test RSS 2.0 feed - This is a test RSS 2.0 feed. - {self.TEST_BASE_URL} - {self.TEST_DATE_STR_RFC2822} - - - Test RSS 2.0 story #1 - This is a test RSS 2.0 story #1. - {self.TEST_BASE_URL}/rss_story_1.html - {self.TEST_BASE_URL}/rss_story_1.html - {self.TEST_DATE_STR_RFC2822} - - - - Test RSS 2.0 story #2 - This is a test RSS 2.0 story #2. - {self.TEST_BASE_URL}/rss_story_2.html - {self.TEST_BASE_URL}/rss_story_2.html - {self.TEST_DATE_STR_RFC2822} - - - - - """ - ).strip(), - ) - - # Atom 0.3 sitemap - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_atom_0_3.xml", - headers={"Content-Type": "application/atom+xml"}, - text=textwrap.dedent( - f""" - - - Test Atom 0.3 feed - - {self.TEST_DATE_STR_ISO8601} - - - Test Atom 0.3 story #1 - - {self.TEST_BASE_URL}/atom_0_3_story_1.html - {self.TEST_DATE_STR_ISO8601} - - - - Test Atom 0.3 story #2 - - {self.TEST_BASE_URL}/atom_0_3_story_2.html - {self.TEST_DATE_STR_ISO8601} - - - - """ - ).strip(), - ) - - # Atom 1.0 sitemap - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_atom_1_0.xml", - headers={"Content-Type": "application/atom+xml"}, - text=textwrap.dedent( - f""" - - - Test Atom 1.0 feed - This is a test Atom 1.0 feed. - - - {self.TEST_BASE_URL} - {self.TEST_DATE_STR_ISO8601} - - - Test Atom 1.0 story #1 - - - - {self.TEST_BASE_URL}/atom_1_0_story_1.html - {self.TEST_DATE_STR_ISO8601} - This is test atom 1.0 story #1. - -
-

This is test atom 1.0 story #1.

-
-
- - John Doe - johndoe@example.com - -
- - - Test Atom 1.0 story #2 - - - - {self.TEST_BASE_URL}/atom_1_0_story_2.html - {self.TEST_DATE_STR_ISO8601} - This is test atom 1.0 story #2. - -
-

This is test atom 1.0 story #2.

-
-
- - John Doe - johndoe@example.com - -
- -
- """ - ).strip(), - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[ - IndexRobotsTxtSitemap( - url=f"{self.TEST_BASE_URL}/robots.txt", - sub_sitemaps=[ - PagesRSSSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_rss.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/rss_story_1.html", - news_story=SitemapNewsStory( - title="Test RSS 2.0 story #1", - publish_date=self.TEST_DATE_DATETIME, - ), - ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/rss_story_2.html", - news_story=SitemapNewsStory( - title="Test RSS 2.0 story #2", - publish_date=self.TEST_DATE_DATETIME, - ), - ), - ], - ), - PagesAtomSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_atom_0_3.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/atom_0_3_story_1.html", - news_story=SitemapNewsStory( - title="Test Atom 0.3 story #1", - publish_date=self.TEST_DATE_DATETIME, - ), - ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/atom_0_3_story_2.html", - news_story=SitemapNewsStory( - title="Test Atom 0.3 story #2", - publish_date=self.TEST_DATE_DATETIME, - ), - ), - ], - ), - PagesAtomSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_atom_1_0.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/atom_1_0_story_1.html", - news_story=SitemapNewsStory( - title="Test Atom 1.0 story #1", - publish_date=self.TEST_DATE_DATETIME, - ), - ), - SitemapPage( - url=f"{self.TEST_BASE_URL}/atom_1_0_story_2.html", - news_story=SitemapNewsStory( - title="Test Atom 1.0 story #2", - publish_date=self.TEST_DATE_DATETIME, - ), - ), - ], - ), - ], - ) - ], - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - - expected_lines = str(expected_sitemap_tree).split() - actual_lines = str(actual_sitemap_tree).split() - diff = difflib.ndiff(expected_lines, actual_lines) - diff_str = "\n".join(diff) - - assert expected_sitemap_tree == actual_sitemap_tree, diff_str - - assert len(list(actual_sitemap_tree.all_pages())) == 6 - assert len(list(actual_sitemap_tree.all_sitemaps())) == 4 - - def test_sitemap_tree_for_homepage_rss_atom_empty(self, requests_mock): - """Test sitemap_tree_for_homepage() with empty RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap_rss.xml - Sitemap: {self.TEST_BASE_URL}/sitemap_atom_0_3.xml - Sitemap: {self.TEST_BASE_URL}/sitemap_atom_1_0.xml - """ - ).strip(), - ) - - # RSS 2.0 sitemap - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_rss.xml", - headers={"Content-Type": "application/rss+xml"}, - text=textwrap.dedent( - f""" - - - - Test RSS 2.0 feed - This is a test RSS 2.0 feed. - {self.TEST_BASE_URL} - {self.TEST_DATE_STR_RFC2822} - - - """ - ).strip(), - ) - - # Atom 0.3 sitemap - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_atom_0_3.xml", - headers={"Content-Type": "application/atom+xml"}, - text=textwrap.dedent( - f""" - - - Test Atom 0.3 feed - - {self.TEST_DATE_STR_ISO8601} - - """ - ).strip(), - ) - - # Atom 1.0 sitemap - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_atom_1_0.xml", - headers={"Content-Type": "application/atom+xml"}, - text=textwrap.dedent( - f""" - - - Test Atom 1.0 feed - This is a test Atom 1.0 feed. 
- - - {self.TEST_BASE_URL} - {self.TEST_DATE_STR_ISO8601} - - """ - ).strip(), - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[ - IndexRobotsTxtSitemap( - url=f"{self.TEST_BASE_URL}/robots.txt", - sub_sitemaps=[ - PagesRSSSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_rss.xml", - pages=[], - ), - PagesAtomSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_atom_0_3.xml", - pages=[], - ), - PagesAtomSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_atom_1_0.xml", - pages=[], - ), - ], - ) - ], - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - - assert expected_sitemap_tree == actual_sitemap_tree - - assert len(list(actual_sitemap_tree.all_pages())) == 0 - assert len(list(actual_sitemap_tree.all_sitemaps())) == 4 - - def test_sitemap_tree_for_homepage_prematurely_ending_xml(self, requests_mock): - """Test sitemap_tree_for_homepage() with clipped XML. - - Some webservers are misconfigured to limit the request length to a certain number of seconds, in which time the - server is unable to generate and compress a 50 MB sitemap XML. Google News doesn't seem to have a problem with - this behavior, so we have to support this too. - """ - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap.xml - """ - ).strip(), - ) - - requests_mock.get( - self.TEST_BASE_URL + "/sitemap.xml", - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/news/first.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - First story - - - - {self.TEST_BASE_URL}/news/second.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - Second story - - - - - - {self.TEST_BASE_URL}/news/third.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - - - - {self.TEST_BASE_URL}/news/public.html - - - """ - ).strip(), - ) - - # Private sitemap (to be discovered by trying out a few paths) - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_index.xml", - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/news/private.html - - - """ - ).strip(), - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[ - IndexRobotsTxtSitemap( - url=f"{self.TEST_BASE_URL}/robots.txt", - sub_sitemaps=[ - PagesXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_public.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/public.html", - ), - ], - ), - ], - ), - PagesXMLSitemap( - url=f"{self.TEST_BASE_URL}/sitemap_index.xml", - pages=[ - SitemapPage( - url=f"{self.TEST_BASE_URL}/news/private.html", - ), - ], - ), - ], - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - - assert expected_sitemap_tree == actual_sitemap_tree - - def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self, requests_mock): - """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt.""" - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - 
headers={"Content-Type": ""}, - text=textwrap.dedent( - """ - User-agent: * - Disallow: /whatever - """.format() - ).strip(), - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[ - IndexRobotsTxtSitemap( - url=f"{self.TEST_BASE_URL}/robots.txt", - sub_sitemaps=[], - ) - ], - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - - assert expected_sitemap_tree == actual_sitemap_tree - - def test_sitemap_tree_for_homepage_no_robots_txt(self, requests_mock): - """Test sitemap_tree_for_homepage() with no robots.txt.""" - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - # Nonexistent robots.txt - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - status_code=404, - reason="Not Found", - headers={"Content-Type": "text/html"}, - text="

404 Not Found!

", - ) - - expected_sitemap_tree = IndexWebsiteSitemap( - url=f"{self.TEST_BASE_URL}/", - sub_sitemaps=[], - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - - assert expected_sitemap_tree == actual_sitemap_tree - - def test_sitemap_tree_for_homepage_huge_sitemap(self, requests_mock): - """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling).""" - - page_count = 1000 - - sitemap_xml = """ - - """ - for x in range(page_count): - sitemap_xml += f""" - - {self.TEST_BASE_URL}/news/page_{x}.html - - - - - - - - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - Foo <foo> - - - """ - - sitemap_xml += "" - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap.xml.gz - """ - ).strip(), - ) - - requests_mock.get( - self.TEST_BASE_URL + "/sitemap.xml.gz", - headers={"Content-Type": "application/x-gzip"}, - content=gzip(sitemap_xml), - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - - assert len(list(actual_sitemap_tree.all_pages())) == page_count - assert len(list(actual_sitemap_tree.all_sitemaps())) == 2 - - def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self, requests_mock): - """Test sitemap_tree_for_homepage() with weird (but valid) spacing.""" - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - robots_txt_body = "" - robots_txt_body += "User-agent: *\n" - # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL - robots_txt_body += f" Sitemap:{self.TEST_BASE_URL}/sitemap.xml " - - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=robots_txt_body, - ) - - requests_mock.get( - self.TEST_BASE_URL + "/sitemap.xml", - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/news/first.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - First story - - - - """ - ).strip(), - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - assert len(list(actual_sitemap_tree.all_pages())) == 1 - assert len(list(actual_sitemap_tree.all_sitemaps())) == 2 - - def test_sitemap_tree_for_homepage_utf8_bom(self, requests_mock): - """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and sitemap.""" - - robots_txt_body = textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap.xml - """ - ).strip() - - sitemap_xml_body = textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/news/first.html - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - First story - - - - """ - ).strip() - - robots_txt_body_encoded = robots_txt_body.encode("utf-8-sig") - sitemap_xml_body_encoded = sitemap_xml_body.encode("utf-8-sig") - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - requests_mock.get( - self.TEST_BASE_URL + 
"/robots.txt", - headers={"Content-Type": "text/plain"}, - content=robots_txt_body_encoded, - ) - - requests_mock.get( - self.TEST_BASE_URL + "/sitemap.xml", - content=sitemap_xml_body_encoded, - ) - - actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) - assert len(list(actual_sitemap_tree.all_pages())) == 1 - assert len(list(actual_sitemap_tree.all_sitemaps())) == 2 - - def test_max_recursion_level_xml(self, requests_mock): - robots_txt_body = textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap.xml - """ - ).strip() - - sitemap_index_body = textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/sitemap.xml - 2024-01-01 - - - """ - ).strip() - - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=robots_txt_body, - ) - requests_mock.get( - self.TEST_BASE_URL + "/sitemap.xml", - headers={"Content-Type": "application/xml"}, - text=sitemap_index_body, - ) - - tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) - sitemaps = list(tree.all_sitemaps()) - - assert type(sitemaps[-1]) is InvalidSitemap - - def test_max_recursion_level_robots(self, requests_mock): - requests_mock.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) - robots_txt_body = textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/robots.txt - """ - ).strip() - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=robots_txt_body, - ) - tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) - sitemaps = list(tree.all_sitemaps()) - assert type(sitemaps[-1]) is InvalidSitemap diff --git a/tests/tree/__init__.py b/tests/tree/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/tree/base.py b/tests/tree/base.py new file mode 100644 index 0000000..502f9e0 --- /dev/null +++ b/tests/tree/base.py @@ -0,0 +1,40 @@ +import datetime +from email.utils import format_datetime + +from dateutil.tz import tzoffset +import requests_mock as rq_mock + + +class TreeTestBase: + TEST_BASE_URL = "http://test_ultimate-sitemap-parser.com" # mocked by HTTPretty + + + # Publication / "last modified" date + TEST_DATE_DATETIME = datetime.datetime( + year=2009, + month=12, + day=17, + hour=12, + minute=4, + second=56, + tzinfo=tzoffset(None, 7200), + ) + TEST_DATE_STR_RFC2822 = format_datetime(TEST_DATE_DATETIME) + """Test string date formatted as RFC 2822 (for RSS 2.0 sitemaps).""" + TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat() + """Test string date formatted as ISO 8601 (for XML and Atom 0.3 / 1.0 sitemaps).""" + + + TEST_PUBLICATION_LANGUAGE = "en" + TEST_PUBLICATION_NAME = "Test publication" + + @staticmethod + def fallback_to_404_not_found_matcher(request): + """Reply with "404 Not Found" to unmatched URLs instead of throwing NoMockAddress.""" + return rq_mock.create_response( + request, + status_code=404, + reason="Not Found", + headers={"Content-Type": "text/html"}, + text="

404 Not Found!

", + ) \ No newline at end of file diff --git a/tests/tree/test_basic.py b/tests/tree/test_basic.py new file mode 100644 index 0000000..e0412cf --- /dev/null +++ b/tests/tree/test_basic.py @@ -0,0 +1,547 @@ +from decimal import Decimal +import difflib +import textwrap +from tests.helpers import gzip + + +from tests.tree.base import TreeTestBase + +from usp.objects.sitemap import ( + IndexRobotsTxtSitemap, + PagesXMLSitemap, + IndexXMLSitemap, + InvalidSitemap, + PagesTextSitemap, + IndexWebsiteSitemap, + PagesRSSSitemap, + PagesAtomSitemap, +) + +from usp.objects.page import ( + SitemapPage, + SitemapNewsStory, + SitemapPageChangeFrequency, +) +from usp.tree import sitemap_tree_for_homepage + + +class TestTreeBasic(TreeTestBase): + def test_sitemap_tree_for_homepage(self, requests_mock): + """Test sitemap_tree_for_homepage().""" + + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_pages.xml + + # Intentionally spelled as "Site-map" as Google tolerates this: + # https://github.com/google/robotstxt/blob/master/robots.cc#L703 + Site-map: {self.TEST_BASE_URL}/sitemap_news_index_1.xml + """ + ).strip(), + ) + + # One sitemap for random static pages + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_pages.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/about.html + {self.TEST_DATE_STR_ISO8601} + monthly + 0.8 + + + {self.TEST_BASE_URL}/contact.html + {self.TEST_DATE_STR_ISO8601} + + + when we feel like it + + + 1.1 + + + + """ + ).strip(), + ) + + # Index sitemap pointing to sitemaps with stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_index_1.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/sitemap_news_1.xml + {self.TEST_DATE_STR_ISO8601} + + + {self.TEST_BASE_URL}/sitemap_news_index_2.xml + {self.TEST_DATE_STR_ISO8601} + + + """ + ).strip(), + ) + + # First sitemap with actual stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_1.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + + + + + {self.TEST_BASE_URL}/news/foo.html + + + + + + + + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + Foo <foo> + + + + + + {self.TEST_BASE_URL}/news/bar.html + + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + Bar & bar + + + + + """ + ).strip(), + ) + + # Another index sitemap pointing to a second sitemaps with stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_index_2.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + + + + + + {self.TEST_BASE_URL}/sitemap_news_2.xml + {self.TEST_DATE_STR_ISO8601} + + + + + {self.TEST_BASE_URL}/sitemap_news_missing.xml + {self.TEST_DATE_STR_ISO8601} + + + + """ + ).strip(), + ) + + # Second sitemap with actual stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_2.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + + + + + + + {self.TEST_BASE_URL}/news/bar.html + + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + 
{self.TEST_DATE_STR_ISO8601} + + + + Bar & bar + + + + + {self.TEST_BASE_URL}/news/baz.html + + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + + + + + + """ + ).strip(), + ) + + # Nonexistent sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_missing.xml", + status_code=404, + reason="Not Found", + headers={"Content-Type": "text/html"}, + text="

404 Not Found!

", + ) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[ + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_pages.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/about.html", + last_modified=self.TEST_DATE_DATETIME, + news_story=None, + change_frequency=SitemapPageChangeFrequency.MONTHLY, + priority=Decimal("0.8"), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/contact.html", + last_modified=self.TEST_DATE_DATETIME, + news_story=None, + # Invalid input -- should be reset to "always" + change_frequency=SitemapPageChangeFrequency.ALWAYS, + # Invalid input -- should be reset to 0.5 (the default as per the spec) + priority=Decimal("0.5"), + ), + ], + ), + IndexXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_news_index_1.xml", + sub_sitemaps=[ + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_news_1.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/foo.html", + news_story=SitemapNewsStory( + title="Foo ", + publish_date=self.TEST_DATE_DATETIME, + publication_name=self.TEST_PUBLICATION_NAME, + publication_language=self.TEST_PUBLICATION_LANGUAGE, + ), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/bar.html", + news_story=SitemapNewsStory( + title="Bar & bar", + publish_date=self.TEST_DATE_DATETIME, + publication_name=self.TEST_PUBLICATION_NAME, + publication_language=self.TEST_PUBLICATION_LANGUAGE, + ), + ), + ], + ), + IndexXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_news_index_2.xml", + sub_sitemaps=[ + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_news_2.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/bar.html", + news_story=SitemapNewsStory( + title="Bar & bar", + publish_date=self.TEST_DATE_DATETIME, + publication_name=self.TEST_PUBLICATION_NAME, + publication_language=self.TEST_PUBLICATION_LANGUAGE, + ), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/baz.html", + news_story=SitemapNewsStory( + title="Bąž", + publish_date=self.TEST_DATE_DATETIME, + publication_name=self.TEST_PUBLICATION_NAME, + publication_language=self.TEST_PUBLICATION_LANGUAGE, + ), + ), + ], + ), + InvalidSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_news_missing.xml", + reason=( + f"Unable to fetch sitemap from {self.TEST_BASE_URL}/sitemap_news_missing.xml: " + "404 Not Found" + ), + ), + ], + ), + ], + ), + ], + ) + ], + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + expected_lines = str(expected_sitemap_tree).split() + actual_lines = str(actual_sitemap_tree).split() + print(actual_lines) + diff = difflib.ndiff(expected_lines, actual_lines) + diff_str = "\n".join(diff) + + assert expected_sitemap_tree == actual_sitemap_tree, diff_str + + assert len(list(actual_sitemap_tree.all_pages())) == 6 + assert len(list(actual_sitemap_tree.all_sitemaps())) == 7 + + def test_sitemap_tree_for_homepage_gzip(self, requests_mock): + """Test sitemap_tree_for_homepage() with gzipped sitemaps.""" + + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_1.gz + Sitemap: {self.TEST_BASE_URL}/sitemap_2.dat + Sitemap: {self.TEST_BASE_URL}/sitemap_3.xml.gz + """ + ).strip(), 
+ ) + + # Gzipped sitemap without correct HTTP header but with .gz extension + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_1.gz", + content=gzip( + textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/news/foo.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + Foo <foo> + + + + """ + ).strip() + ), + ) + + # Gzipped sitemap with correct HTTP header but without .gz extension + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_2.dat", + headers={"Content-Type": "application/x-gzip"}, + content=gzip( + textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/news/bar.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + + + + + """ + ).strip() + ), + ) + + # Sitemap which appears to be gzipped (due to extension and Content-Type) but really isn't + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_3.xml.gz", + headers={"Content-Type": "application/x-gzip"}, + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/news/baz.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + + + + + """ + ).strip(), + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + # Don't do an in-depth check, we just need to make sure that gunzip works + assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) + assert len(actual_sitemap_tree.sub_sitemaps) == 1 + + assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap) + # noinspection PyUnresolvedReferences + assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 3 + + # noinspection PyUnresolvedReferences + sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0] + assert isinstance(sitemap_1, PagesXMLSitemap) + assert len(sitemap_1.pages) == 1 + + # noinspection PyUnresolvedReferences + sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1] + assert isinstance(sitemap_2, PagesXMLSitemap) + assert len(sitemap_2.pages) == 1 + + # noinspection PyUnresolvedReferences + sitemap_3 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[2] + assert isinstance(sitemap_3, PagesXMLSitemap) + assert len(sitemap_3.pages) == 1 + + def test_sitemap_tree_for_homepage_huge_sitemap(self, requests_mock): + """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling).""" + + page_count = 1000 + + sitemap_xml = """ + + """ + for x in range(page_count): + sitemap_xml += f""" + + {self.TEST_BASE_URL}/news/page_{x}.html + + + + + + + + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + Foo <foo> + + + """ + + sitemap_xml += "" + + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap.xml.gz + """ + ).strip(), + ) + + requests_mock.get( + self.TEST_BASE_URL + "/sitemap.xml.gz", + headers={"Content-Type": "application/x-gzip"}, + content=gzip(sitemap_xml), + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + assert len(list(actual_sitemap_tree.all_pages())) == page_count + assert len(list(actual_sitemap_tree.all_sitemaps())) == 2 diff --git a/tests/tree/test_edges.py b/tests/tree/test_edges.py new file mode 
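The three gzip fixtures above cover the cases a crawler meets in practice: a .gz URL served without a gzip Content-Type, a gzip Content-Type on a URL without the extension, and a response that merely claims to be compressed. A minimal sketch of that kind of content sniffing, offered purely as an illustration of the idea with a hypothetical maybe_gunzip() helper rather than the library's actual code, could look like this:

    import gzip as gzip_module
    import io

    def maybe_gunzip(raw: bytes) -> bytes:
        # Decide by the gzip magic bytes, not by Content-Type or extension.
        if raw[:2] == b"\x1f\x8b":
            try:
                return gzip_module.GzipFile(fileobj=io.BytesIO(raw)).read()
            except OSError:
                pass  # mislabelled or corrupt gzip; fall back to the raw payload
        return raw
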
100644 index 0000000..149d1c0 --- /dev/null +++ b/tests/tree/test_edges.py @@ -0,0 +1,138 @@ +from decimal import Decimal +import difflib +import textwrap +from tests.helpers import gzip + + +from tests.tree.base import TreeTestBase + +from usp.objects.sitemap import ( + IndexRobotsTxtSitemap, + PagesXMLSitemap, + IndexXMLSitemap, + InvalidSitemap, + PagesTextSitemap, + IndexWebsiteSitemap, + PagesRSSSitemap, + PagesAtomSitemap, +) + +from usp.objects.page import ( + SitemapPage, + SitemapNewsStory, + SitemapPageChangeFrequency, +) +from usp.tree import sitemap_tree_for_homepage + + +class TestTreeBasic(TreeTestBase): + def test_sitemap_tree_for_homepage_utf8_bom(self, requests_mock): + """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and sitemap.""" + + robots_txt_body = textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap.xml + """ + ).strip() + + sitemap_xml_body = textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/news/first.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + First story + + + + """ + ).strip() + + robots_txt_body_encoded = robots_txt_body.encode("utf-8-sig") + sitemap_xml_body_encoded = sitemap_xml_body.encode("utf-8-sig") + + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + content=robots_txt_body_encoded, + ) + + requests_mock.get( + self.TEST_BASE_URL + "/sitemap.xml", + content=sitemap_xml_body_encoded, + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + assert len(list(actual_sitemap_tree.all_pages())) == 1 + assert len(list(actual_sitemap_tree.all_sitemaps())) == 2 + + def test_max_recursion_level_xml(self, requests_mock): + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=(textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap.xml + """ + ).strip()), + ) + requests_mock.get( + self.TEST_BASE_URL + "/sitemap.xml", + headers={"Content-Type": "application/xml"}, + text=(textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/sitemap.xml + 2024-01-01 + + + """ + ).strip()), + ) + + tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) + sitemaps = list(tree.all_sitemaps()) + + assert type(sitemaps[-1]) is InvalidSitemap + + + def test_max_recursion_level_robots(self, requests_mock): + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=(textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/robots.txt + """ + ).strip()), + ) + tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) + sitemaps = list(tree.all_sitemaps()) + assert type(sitemaps[-1]) is InvalidSitemap diff --git a/tests/tree/test_plain_text.py b/tests/tree/test_plain_text.py new file mode 100644 index 0000000..edd3908 --- /dev/null +++ b/tests/tree/test_plain_text.py @@ -0,0 +1,104 @@ +import textwrap + +from tests.helpers import gzip +from tests.tree.base import TreeTestBase +from usp.tree import sitemap_tree_for_homepage + +from usp.objects.sitemap import ( + IndexRobotsTxtSitemap, 
+ PagesXMLSitemap, + IndexXMLSitemap, + InvalidSitemap, + PagesTextSitemap, + IndexWebsiteSitemap, + PagesRSSSitemap, + PagesAtomSitemap, +) + +from usp.objects.page import ( + SitemapPage, + SitemapNewsStory, + SitemapPageChangeFrequency, +) + +class TestTreeBasic(TreeTestBase): + def test_sitemap_tree_for_homepage_plain_text(self, requests_mock): + """Test sitemap_tree_for_homepage() with plain text sitemaps.""" + + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_1.txt + Sitemap: {self.TEST_BASE_URL}/sitemap_2.txt.dat + """ + ).strip(), + ) + + # Plain text uncompressed sitemap (no Content-Type header) + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_1.txt", + text=textwrap.dedent( + f""" + + {self.TEST_BASE_URL}/news/foo.html + + + {self.TEST_BASE_URL}/news/bar.html + + Some other stuff which totally doesn't look like an URL + """ + ).strip(), + ) + + # Plain text compressed sitemap without .gz extension + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_2.txt.dat", + headers={"Content-Type": "application/x-gzip"}, + content=gzip( + textwrap.dedent( + f""" + {self.TEST_BASE_URL}/news/bar.html + {self.TEST_BASE_URL}/news/baz.html + """ + ).strip() + ), + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) + assert len(actual_sitemap_tree.sub_sitemaps) == 1 + + assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap) + # noinspection PyUnresolvedReferences + assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2 + + # noinspection PyUnresolvedReferences + sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0] + assert isinstance(sitemap_1, PagesTextSitemap) + assert len(sitemap_1.pages) == 2 + + # noinspection PyUnresolvedReferences + sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1] + assert isinstance(sitemap_2, PagesTextSitemap) + assert len(sitemap_2.pages) == 2 + + pages = list(actual_sitemap_tree.all_pages()) + assert len(pages) == 4 + assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/foo.html") in pages + assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/bar.html") in pages + assert SitemapPage(url=f"{self.TEST_BASE_URL}/news/baz.html") in pages + + assert len(list(actual_sitemap_tree.all_sitemaps())) == 3 diff --git a/tests/tree/test_robots.py b/tests/tree/test_robots.py new file mode 100644 index 0000000..e7a894b --- /dev/null +++ b/tests/tree/test_robots.py @@ -0,0 +1,135 @@ +import difflib +import textwrap + +from tests.helpers import gzip +from tests.tree.base import TreeTestBase +from usp.tree import sitemap_tree_for_homepage + +from usp.objects.sitemap import ( + IndexRobotsTxtSitemap, + PagesXMLSitemap, + IndexXMLSitemap, + InvalidSitemap, + PagesTextSitemap, + IndexWebsiteSitemap, + PagesRSSSitemap, + PagesAtomSitemap, +) + +from usp.objects.page import ( + SitemapPage, + SitemapNewsStory, + SitemapPageChangeFrequency, +) + +class TestTreeRobots(TreeTestBase): + def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self, requests_mock): + """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt.""" + + 
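The robots.txt fixtures in this suite omit the Content-Type header, put odd but valid spacing around the directive, and elsewhere use the tolerated "Site-map" spelling. A rough sketch of parsing that copes with all three, stated as an assumption about the general approach with a hypothetical extract_sitemap_urls() helper and not as the library's actual parser, might be:

    def extract_sitemap_urls(robots_txt: str) -> list:
        # robots.txt is treated as plain text regardless of the response's
        # Content-Type; directive names are matched case-insensitively and
        # surrounding whitespace is stripped.
        urls = []
        for line in robots_txt.splitlines():
            key, _, value = line.partition(":")
            if key.strip().lower() in ("sitemap", "site-map") and value.strip():
                urls.append(value.strip())
        return urls

    assert extract_sitemap_urls(" Sitemap:http://example.com/sitemap.xml ") == [
        "http://example.com/sitemap.xml"
    ]
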
requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": ""}, + text=textwrap.dedent( + """ + User-agent: * + Disallow: /whatever + """.format() + ).strip(), + ) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[], + ) + ], + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + assert expected_sitemap_tree == actual_sitemap_tree + + def test_sitemap_tree_for_homepage_no_robots_txt(self, requests_mock): + """Test sitemap_tree_for_homepage() with no robots.txt.""" + + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + # Nonexistent robots.txt + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + status_code=404, + reason="Not Found", + headers={"Content-Type": "text/html"}, + text="

404 Not Found!

", + ) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[], + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + assert expected_sitemap_tree == actual_sitemap_tree + + def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self, requests_mock): + """Test sitemap_tree_for_homepage() with weird (but valid) spacing.""" + + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + robots_txt_body = "" + robots_txt_body += "User-agent: *\n" + # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL + robots_txt_body += f" Sitemap:{self.TEST_BASE_URL}/sitemap.xml " + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=robots_txt_body, + ) + + requests_mock.get( + self.TEST_BASE_URL + "/sitemap.xml", + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/news/first.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + First story + + + + """ + ).strip(), + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + assert len(list(actual_sitemap_tree.all_pages())) == 1 + assert len(list(actual_sitemap_tree.all_sitemaps())) == 2 diff --git a/tests/tree/test_rss_atom.py b/tests/tree/test_rss_atom.py new file mode 100644 index 0000000..49a4885 --- /dev/null +++ b/tests/tree/test_rss_atom.py @@ -0,0 +1,363 @@ +import difflib +import textwrap + +from tests.helpers import gzip +from tests.tree.base import TreeTestBase +from usp.tree import sitemap_tree_for_homepage + +from usp.objects.sitemap import ( + IndexRobotsTxtSitemap, + PagesXMLSitemap, + IndexXMLSitemap, + InvalidSitemap, + PagesTextSitemap, + IndexWebsiteSitemap, + PagesRSSSitemap, + PagesAtomSitemap, +) + +from usp.objects.page import ( + SitemapPage, + SitemapNewsStory, + SitemapPageChangeFrequency, +) + +class TestTreeBasic(TreeTestBase): + def test_sitemap_tree_for_homepage_rss_atom(self, requests_mock): + """Test sitemap_tree_for_homepage() with RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" + + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_rss.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_0_3.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_1_0.xml + """ + ).strip(), + ) + + # RSS 2.0 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_rss.xml", + headers={"Content-Type": "application/rss+xml"}, + text=textwrap.dedent( + f""" + + + + Test RSS 2.0 feed + This is a test RSS 2.0 feed. + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_RFC2822} + + + Test RSS 2.0 story #1 + This is a test RSS 2.0 story #1. + {self.TEST_BASE_URL}/rss_story_1.html + {self.TEST_BASE_URL}/rss_story_1.html + {self.TEST_DATE_STR_RFC2822} + + + + Test RSS 2.0 story #2 + This is a test RSS 2.0 story #2. 
+ {self.TEST_BASE_URL}/rss_story_2.html + {self.TEST_BASE_URL}/rss_story_2.html + {self.TEST_DATE_STR_RFC2822} + + + + + """ + ).strip(), + ) + + # Atom 0.3 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_atom_0_3.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" + + + Test Atom 0.3 feed + + {self.TEST_DATE_STR_ISO8601} + + + Test Atom 0.3 story #1 + + {self.TEST_BASE_URL}/atom_0_3_story_1.html + {self.TEST_DATE_STR_ISO8601} + + + + Test Atom 0.3 story #2 + + {self.TEST_BASE_URL}/atom_0_3_story_2.html + {self.TEST_DATE_STR_ISO8601} + + + + """ + ).strip(), + ) + + # Atom 1.0 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_atom_1_0.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" + + + Test Atom 1.0 feed + This is a test Atom 1.0 feed. + + + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_ISO8601} + + + Test Atom 1.0 story #1 + + + + {self.TEST_BASE_URL}/atom_1_0_story_1.html + {self.TEST_DATE_STR_ISO8601} + This is test atom 1.0 story #1. + +
+

This is test atom 1.0 story #1.

+
+
+ + John Doe + johndoe@example.com + +
+ + + Test Atom 1.0 story #2 + + + + {self.TEST_BASE_URL}/atom_1_0_story_2.html + {self.TEST_DATE_STR_ISO8601} + This is test atom 1.0 story #2. + +
+

This is test atom 1.0 story #2.

+
+
+ + John Doe + johndoe@example.com + +
+ +
+ """ + ).strip(), + ) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[ + PagesRSSSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_rss.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/rss_story_1.html", + news_story=SitemapNewsStory( + title="Test RSS 2.0 story #1", + publish_date=self.TEST_DATE_DATETIME, + ), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/rss_story_2.html", + news_story=SitemapNewsStory( + title="Test RSS 2.0 story #2", + publish_date=self.TEST_DATE_DATETIME, + ), + ), + ], + ), + PagesAtomSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_atom_0_3.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/atom_0_3_story_1.html", + news_story=SitemapNewsStory( + title="Test Atom 0.3 story #1", + publish_date=self.TEST_DATE_DATETIME, + ), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/atom_0_3_story_2.html", + news_story=SitemapNewsStory( + title="Test Atom 0.3 story #2", + publish_date=self.TEST_DATE_DATETIME, + ), + ), + ], + ), + PagesAtomSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_atom_1_0.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/atom_1_0_story_1.html", + news_story=SitemapNewsStory( + title="Test Atom 1.0 story #1", + publish_date=self.TEST_DATE_DATETIME, + ), + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/atom_1_0_story_2.html", + news_story=SitemapNewsStory( + title="Test Atom 1.0 story #2", + publish_date=self.TEST_DATE_DATETIME, + ), + ), + ], + ), + ], + ) + ], + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + expected_lines = str(expected_sitemap_tree).split() + actual_lines = str(actual_sitemap_tree).split() + diff = difflib.ndiff(expected_lines, actual_lines) + diff_str = "\n".join(diff) + + assert expected_sitemap_tree == actual_sitemap_tree, diff_str + + assert len(list(actual_sitemap_tree.all_pages())) == 6 + assert len(list(actual_sitemap_tree.all_sitemaps())) == 4 + + def test_sitemap_tree_for_homepage_rss_atom_empty(self, requests_mock): + """Test sitemap_tree_for_homepage() with empty RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" + + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_rss.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_0_3.xml + Sitemap: {self.TEST_BASE_URL}/sitemap_atom_1_0.xml + """ + ).strip(), + ) + + # RSS 2.0 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_rss.xml", + headers={"Content-Type": "application/rss+xml"}, + text=textwrap.dedent( + f""" + + + + Test RSS 2.0 feed + This is a test RSS 2.0 feed. + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_RFC2822} + + + """ + ).strip(), + ) + + # Atom 0.3 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_atom_0_3.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" + + + Test Atom 0.3 feed + + {self.TEST_DATE_STR_ISO8601} + + """ + ).strip(), + ) + + # Atom 1.0 sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_atom_1_0.xml", + headers={"Content-Type": "application/atom+xml"}, + text=textwrap.dedent( + f""" + + + Test Atom 1.0 feed + This is a test Atom 1.0 feed. 
+ + + {self.TEST_BASE_URL} + {self.TEST_DATE_STR_ISO8601} + + """ + ).strip(), + ) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[ + PagesRSSSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_rss.xml", + pages=[], + ), + PagesAtomSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_atom_0_3.xml", + pages=[], + ), + PagesAtomSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_atom_1_0.xml", + pages=[], + ), + ], + ) + ], + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + assert expected_sitemap_tree == actual_sitemap_tree + + assert len(list(actual_sitemap_tree.all_pages())) == 0 + assert len(list(actual_sitemap_tree.all_sitemaps())) == 4 diff --git a/tests/tree/test_xml.py b/tests/tree/test_xml.py new file mode 100644 index 0000000..9a28956 --- /dev/null +++ b/tests/tree/test_xml.py @@ -0,0 +1,228 @@ +import difflib +import textwrap + +from tests.helpers import gzip +from tests.tree.base import TreeTestBase +from usp.tree import sitemap_tree_for_homepage + +from usp.objects.sitemap import ( + IndexRobotsTxtSitemap, + PagesXMLSitemap, + IndexXMLSitemap, + InvalidSitemap, + PagesTextSitemap, + IndexWebsiteSitemap, + PagesRSSSitemap, + PagesAtomSitemap, +) + +from usp.objects.page import ( + SitemapPage, + SitemapNewsStory, + SitemapPageChangeFrequency, +) + +class TestTreeXML(TreeTestBase): + def test_sitemap_tree_for_homepage_prematurely_ending_xml(self, requests_mock): + """Test sitemap_tree_for_homepage() with clipped XML. + + Some webservers are misconfigured to limit the request length to a certain number of seconds, in which time the + server is unable to generate and compress a 50 MB sitemap XML. Google News doesn't seem to have a problem with + this behavior, so we have to support this too. 
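A clipped document can still be mined for the pages that made it through. Purely as an illustration of the idea, using a hypothetical locs_from_clipped_xml() helper rather than this library's implementation, an event-driven parse can keep everything seen before the truncation error:

    import xml.parsers.expat

    def locs_from_clipped_xml(clipped_xml: str) -> list:
        locs, chars, in_loc = [], [], False

        def start(name, attrs):
            nonlocal in_loc
            if name.endswith("loc"):
                in_loc = True
                chars.clear()

        def end(name):
            nonlocal in_loc
            if name.endswith("loc"):
                locs.append("".join(chars).strip())
                in_loc = False

        def cdata(data):
            if in_loc:
                chars.append(data)

        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = start
        parser.EndElementHandler = end
        parser.CharacterDataHandler = cdata
        try:
            parser.Parse(clipped_xml, True)
        except xml.parsers.expat.ExpatError:
            pass  # truncated mid-document; keep whatever was already parsed
        return locs
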
+ """ + + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap.xml + """ + ).strip(), + ) + + requests_mock.get( + self.TEST_BASE_URL + "/sitemap.xml", + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/news/first.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + First story + + + + {self.TEST_BASE_URL}/news/second.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + Second story + + + + + + {self.TEST_BASE_URL}/news/third.html + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + + + + {self.TEST_BASE_URL}/news/public.html + + + """ + ).strip(), + ) + + # Private sitemap (to be discovered by trying out a few paths) + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_index.xml", + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/news/private.html + + + """ + ).strip(), + ) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[ + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_public.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/public.html", + ), + ], + ), + ], + ), + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_index.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/news/private.html", + ), + ], + ), + ], + ) + + actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) + + assert expected_sitemap_tree == actual_sitemap_tree diff --git a/tests/web_client/test_requests_client.py b/tests/web_client/test_requests_client.py index 0a3e1a2..c4b18b8 100644 --- a/tests/web_client/test_requests_client.py +++ b/tests/web_client/test_requests_client.py @@ -4,7 +4,7 @@ import pytest -from usp.__about__ import __version__ +from usp import __version__ from usp.web_client.abstract_client import ( AbstractWebClientSuccessResponse, WebClientErrorResponse, From 280938c1249256106f6814e90ae03d19fd0ba30f Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Sat, 31 Aug 2024 16:52:48 +0100 Subject: [PATCH 42/79] Ruff --- tests/tree/base.py | 4 +--- tests/tree/test_basic.py | 3 --- tests/tree/test_edges.py | 40 +++++++++++++---------------------- tests/tree/test_plain_text.py | 8 +------ tests/tree/test_robots.py | 13 ------------ tests/tree/test_rss_atom.py | 7 +----- tests/tree/test_xml.py | 10 +-------- 7 files changed, 19 insertions(+), 66 deletions(-) diff --git a/tests/tree/base.py b/tests/tree/base.py index 502f9e0..ffe68f7 100644 --- a/tests/tree/base.py +++ b/tests/tree/base.py @@ -8,7 +8,6 @@ class TreeTestBase: TEST_BASE_URL = "http://test_ultimate-sitemap-parser.com" # mocked by HTTPretty - # Publication / "last modified" date TEST_DATE_DATETIME = datetime.datetime( year=2009, @@ -24,7 +23,6 @@ class TreeTestBase: TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat() """Test string date formatted as ISO 8601 (for XML and Atom 0.3 / 1.0 sitemaps).""" - TEST_PUBLICATION_LANGUAGE = "en" TEST_PUBLICATION_NAME = "Test publication" @@ -37,4 +35,4 @@ def fallback_to_404_not_found_matcher(request): reason="Not Found", headers={"Content-Type": 
"text/html"}, text="

404 Not Found!

", - ) \ No newline at end of file + ) diff --git a/tests/tree/test_basic.py b/tests/tree/test_basic.py index e0412cf..68a7d03 100644 --- a/tests/tree/test_basic.py +++ b/tests/tree/test_basic.py @@ -11,10 +11,7 @@ PagesXMLSitemap, IndexXMLSitemap, InvalidSitemap, - PagesTextSitemap, IndexWebsiteSitemap, - PagesRSSSitemap, - PagesAtomSitemap, ) from usp.objects.page import ( diff --git a/tests/tree/test_edges.py b/tests/tree/test_edges.py index 149d1c0..e4ad61e 100644 --- a/tests/tree/test_edges.py +++ b/tests/tree/test_edges.py @@ -1,27 +1,12 @@ -from decimal import Decimal -import difflib import textwrap -from tests.helpers import gzip from tests.tree.base import TreeTestBase from usp.objects.sitemap import ( - IndexRobotsTxtSitemap, - PagesXMLSitemap, - IndexXMLSitemap, InvalidSitemap, - PagesTextSitemap, - IndexWebsiteSitemap, - PagesRSSSitemap, - PagesAtomSitemap, ) -from usp.objects.page import ( - SitemapPage, - SitemapNewsStory, - SitemapPageChangeFrequency, -) from usp.tree import sitemap_tree_for_homepage @@ -88,20 +73,23 @@ def test_max_recursion_level_xml(self, requests_mock): requests_mock.get( self.TEST_BASE_URL + "/robots.txt", headers={"Content-Type": "text/plain"}, - text=(textwrap.dedent( - f""" + text=( + textwrap.dedent( + f""" User-agent: * Disallow: /whatever Sitemap: {self.TEST_BASE_URL}/sitemap.xml """ - ).strip()), + ).strip() + ), ) requests_mock.get( self.TEST_BASE_URL + "/sitemap.xml", headers={"Content-Type": "application/xml"}, - text=(textwrap.dedent( - f""" + text=( + textwrap.dedent( + f""" @@ -110,7 +98,8 @@ def test_max_recursion_level_xml(self, requests_mock): """ - ).strip()), + ).strip() + ), ) tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) @@ -118,20 +107,21 @@ def test_max_recursion_level_xml(self, requests_mock): assert type(sitemaps[-1]) is InvalidSitemap - def test_max_recursion_level_robots(self, requests_mock): requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) requests_mock.get( self.TEST_BASE_URL + "/robots.txt", headers={"Content-Type": "text/plain"}, - text=(textwrap.dedent( - f""" + text=( + textwrap.dedent( + f""" User-agent: * Disallow: /whatever Sitemap: {self.TEST_BASE_URL}/robots.txt """ - ).strip()), + ).strip() + ), ) tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) sitemaps = list(tree.all_sitemaps()) diff --git a/tests/tree/test_plain_text.py b/tests/tree/test_plain_text.py index edd3908..af59ec0 100644 --- a/tests/tree/test_plain_text.py +++ b/tests/tree/test_plain_text.py @@ -6,21 +6,15 @@ from usp.objects.sitemap import ( IndexRobotsTxtSitemap, - PagesXMLSitemap, - IndexXMLSitemap, - InvalidSitemap, PagesTextSitemap, IndexWebsiteSitemap, - PagesRSSSitemap, - PagesAtomSitemap, ) from usp.objects.page import ( SitemapPage, - SitemapNewsStory, - SitemapPageChangeFrequency, ) + class TestTreeBasic(TreeTestBase): def test_sitemap_tree_for_homepage_plain_text(self, requests_mock): """Test sitemap_tree_for_homepage() with plain text sitemaps.""" diff --git a/tests/tree/test_robots.py b/tests/tree/test_robots.py index e7a894b..65cdb9e 100644 --- a/tests/tree/test_robots.py +++ b/tests/tree/test_robots.py @@ -1,26 +1,13 @@ -import difflib import textwrap -from tests.helpers import gzip from tests.tree.base import TreeTestBase from usp.tree import sitemap_tree_for_homepage from usp.objects.sitemap import ( IndexRobotsTxtSitemap, - PagesXMLSitemap, - IndexXMLSitemap, - InvalidSitemap, - PagesTextSitemap, IndexWebsiteSitemap, - PagesRSSSitemap, - PagesAtomSitemap, ) -from usp.objects.page import ( - 
SitemapPage, - SitemapNewsStory, - SitemapPageChangeFrequency, -) class TestTreeRobots(TreeTestBase): def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self, requests_mock): diff --git a/tests/tree/test_rss_atom.py b/tests/tree/test_rss_atom.py index 49a4885..05f71dd 100644 --- a/tests/tree/test_rss_atom.py +++ b/tests/tree/test_rss_atom.py @@ -1,16 +1,11 @@ import difflib import textwrap -from tests.helpers import gzip from tests.tree.base import TreeTestBase from usp.tree import sitemap_tree_for_homepage from usp.objects.sitemap import ( IndexRobotsTxtSitemap, - PagesXMLSitemap, - IndexXMLSitemap, - InvalidSitemap, - PagesTextSitemap, IndexWebsiteSitemap, PagesRSSSitemap, PagesAtomSitemap, @@ -19,9 +14,9 @@ from usp.objects.page import ( SitemapPage, SitemapNewsStory, - SitemapPageChangeFrequency, ) + class TestTreeBasic(TreeTestBase): def test_sitemap_tree_for_homepage_rss_atom(self, requests_mock): """Test sitemap_tree_for_homepage() with RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" diff --git a/tests/tree/test_xml.py b/tests/tree/test_xml.py index 9a28956..0aee09e 100644 --- a/tests/tree/test_xml.py +++ b/tests/tree/test_xml.py @@ -1,27 +1,19 @@ -import difflib import textwrap -from tests.helpers import gzip from tests.tree.base import TreeTestBase from usp.tree import sitemap_tree_for_homepage from usp.objects.sitemap import ( IndexRobotsTxtSitemap, PagesXMLSitemap, - IndexXMLSitemap, - InvalidSitemap, - PagesTextSitemap, IndexWebsiteSitemap, - PagesRSSSitemap, - PagesAtomSitemap, ) from usp.objects.page import ( SitemapPage, - SitemapNewsStory, - SitemapPageChangeFrequency, ) + class TestTreeXML(TreeTestBase): def test_sitemap_tree_for_homepage_prematurely_ending_xml(self, requests_mock): """Test sitemap_tree_for_homepage() with clipped XML. From 31dc767b0dce0653cc9663fda1af31d9687589f2 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Mon, 2 Sep 2024 17:23:03 +0100 Subject: [PATCH 43/79] Dict and pickle serialisation --- tests/tree/base.py | 216 +++++++++++++++++++++++++++++++++++++++ tests/tree/test_basic.py | 215 +------------------------------------- tests/tree/test_save.py | 143 ++++++++++++++++++++++++++ usp/objects/page.py | 27 +++++ usp/objects/sitemap.py | 82 ++++++++++++++- 5 files changed, 468 insertions(+), 215 deletions(-) create mode 100644 tests/tree/test_save.py diff --git a/tests/tree/base.py b/tests/tree/base.py index ffe68f7..e4f6f6b 100644 --- a/tests/tree/base.py +++ b/tests/tree/base.py @@ -1,5 +1,6 @@ import datetime from email.utils import format_datetime +import textwrap from dateutil.tz import tzoffset import requests_mock as rq_mock @@ -36,3 +37,218 @@ def fallback_to_404_not_found_matcher(request): headers={"Content-Type": "text/html"}, text="

404 Not Found!

", ) + + def init_basic_sitemap(self, requests_mock): + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/", + text="This is a homepage.", + ) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_pages.xml + + # Intentionally spelled as "Site-map" as Google tolerates this: + # https://github.com/google/robotstxt/blob/master/robots.cc#L703 + Site-map: {self.TEST_BASE_URL}/sitemap_news_index_1.xml + """ + ).strip(), + ) + + # One sitemap for random static pages + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_pages.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/about.html + {self.TEST_DATE_STR_ISO8601} + monthly + 0.8 + + + {self.TEST_BASE_URL}/contact.html + {self.TEST_DATE_STR_ISO8601} + + + when we feel like it + + + 1.1 + + + + """ + ).strip(), + ) + + # Index sitemap pointing to sitemaps with stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_index_1.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/sitemap_news_1.xml + {self.TEST_DATE_STR_ISO8601} + + + {self.TEST_BASE_URL}/sitemap_news_index_2.xml + {self.TEST_DATE_STR_ISO8601} + + + """ + ).strip(), + ) + + # First sitemap with actual stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_1.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + + + + + {self.TEST_BASE_URL}/news/foo.html + + + + + + + + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + Foo <foo> + + + + + + {self.TEST_BASE_URL}/news/bar.html + + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + Bar & bar + + + + + """ + ).strip(), + ) + + # Another index sitemap pointing to a second sitemaps with stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_index_2.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + + + + + + {self.TEST_BASE_URL}/sitemap_news_2.xml + {self.TEST_DATE_STR_ISO8601} + + + + + {self.TEST_BASE_URL}/sitemap_news_missing.xml + {self.TEST_DATE_STR_ISO8601} + + + + """ + ).strip(), + ) + + # Second sitemap with actual stories + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_2.xml", + headers={"Content-Type": "application/xml"}, + text=textwrap.dedent( + f""" + + + + + + + {self.TEST_BASE_URL}/news/bar.html + + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + + + + Bar & bar + + + + + {self.TEST_BASE_URL}/news/baz.html + + + + {self.TEST_PUBLICATION_NAME} + {self.TEST_PUBLICATION_LANGUAGE} + + {self.TEST_DATE_STR_ISO8601} + + + + + + """ + ).strip(), + ) + + # Nonexistent sitemap + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_news_missing.xml", + status_code=404, + reason="Not Found", + headers={"Content-Type": "text/html"}, + text="
<h1>404 Not Found!</h1>
", + ) diff --git a/tests/tree/test_basic.py b/tests/tree/test_basic.py index 68a7d03..61902a1 100644 --- a/tests/tree/test_basic.py +++ b/tests/tree/test_basic.py @@ -25,220 +25,7 @@ class TestTreeBasic(TreeTestBase): def test_sitemap_tree_for_homepage(self, requests_mock): """Test sitemap_tree_for_homepage().""" - - requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) - - requests_mock.get( - self.TEST_BASE_URL + "/", - text="This is a homepage.", - ) - - requests_mock.get( - self.TEST_BASE_URL + "/robots.txt", - headers={"Content-Type": "text/plain"}, - text=textwrap.dedent( - f""" - User-agent: * - Disallow: /whatever - - Sitemap: {self.TEST_BASE_URL}/sitemap_pages.xml - - # Intentionally spelled as "Site-map" as Google tolerates this: - # https://github.com/google/robotstxt/blob/master/robots.cc#L703 - Site-map: {self.TEST_BASE_URL}/sitemap_news_index_1.xml - """ - ).strip(), - ) - - # One sitemap for random static pages - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_pages.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/about.html - {self.TEST_DATE_STR_ISO8601} - monthly - 0.8 - - - {self.TEST_BASE_URL}/contact.html - {self.TEST_DATE_STR_ISO8601} - - - when we feel like it - - - 1.1 - - - - """ - ).strip(), - ) - - # Index sitemap pointing to sitemaps with stories - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_news_index_1.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - - - - {self.TEST_BASE_URL}/sitemap_news_1.xml - {self.TEST_DATE_STR_ISO8601} - - - {self.TEST_BASE_URL}/sitemap_news_index_2.xml - {self.TEST_DATE_STR_ISO8601} - - - """ - ).strip(), - ) - - # First sitemap with actual stories - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_news_1.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - - - - - {self.TEST_BASE_URL}/news/foo.html - - - - - - - - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - Foo <foo> - - - - - - {self.TEST_BASE_URL}/news/bar.html - - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - Bar & bar - - - - - """ - ).strip(), - ) - - # Another index sitemap pointing to a second sitemaps with stories - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_news_index_2.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - - - - - - {self.TEST_BASE_URL}/sitemap_news_2.xml - {self.TEST_DATE_STR_ISO8601} - - - - - {self.TEST_BASE_URL}/sitemap_news_missing.xml - {self.TEST_DATE_STR_ISO8601} - - - - """ - ).strip(), - ) - - # Second sitemap with actual stories - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_news_2.xml", - headers={"Content-Type": "application/xml"}, - text=textwrap.dedent( - f""" - - - - - - - {self.TEST_BASE_URL}/news/bar.html - - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - - - - Bar & bar - - - - - {self.TEST_BASE_URL}/news/baz.html - - - - {self.TEST_PUBLICATION_NAME} - {self.TEST_PUBLICATION_LANGUAGE} - - {self.TEST_DATE_STR_ISO8601} - - - - - - """ - ).strip(), - ) - - # Nonexistent sitemap - requests_mock.get( - self.TEST_BASE_URL + "/sitemap_news_missing.xml", - status_code=404, - reason="Not Found", - headers={"Content-Type": "text/html"}, - text="
<h1>404 Not Found!</h1>
", - ) + self.init_basic_sitemap(requests_mock) expected_sitemap_tree = IndexWebsiteSitemap( url=f"{self.TEST_BASE_URL}/", diff --git a/tests/tree/test_save.py b/tests/tree/test_save.py new file mode 100644 index 0000000..1d78ae0 --- /dev/null +++ b/tests/tree/test_save.py @@ -0,0 +1,143 @@ +import datetime +from decimal import Decimal +import os +import pickle +from dateutil.tz import tzoffset +import pytest + +from tests.tree.base import TreeTestBase +from usp.tree import sitemap_tree_for_homepage + + +class TestTreeSave(TreeTestBase): + @pytest.fixture + def tree(self, requests_mock): + self.init_basic_sitemap(requests_mock) + + return sitemap_tree_for_homepage(self.TEST_BASE_URL) + + def test_pickle(self, tree, tmp_path): + with open(tmp_path / "sitemap.pickle", "wb") as f: + pickle.dump(tree, f) + + tree_all_pages = list(tree.all_pages()) + + # Delete the temp file without deleting the object + os.unlink( + tree.sub_sitemaps[0] + .sub_sitemaps[0] + ._AbstractPagesSitemap__pages_temp_file_path + ) + + with open(tmp_path / "sitemap.pickle", "rb") as f: + tree_loaded = pickle.load(f) + + assert tree_all_pages == list(tree_loaded.all_pages()) + assert len(list(tree_loaded.all_sitemaps())) == 7 + + def test_tree_to_dict(self, tree): + tree_d = tree.to_dict() + + assert len(tree_d["sub_sitemaps"][0]["sub_sitemaps"][0]["pages"]) == 2 + assert "pages" not in tree_d["sub_sitemaps"][0], "index sitemap has pages key" + assert ( + "sub_sitemaps" not in tree_d["sub_sitemaps"][0]["sub_sitemaps"][0] + ), "page sitemap has sub_sitemaps key" + + def test_page_to_dict(self, tree, tmp_path): + pages = list(tree.all_pages()) + + pages_d = [page.to_dict() for page in pages] + + assert pages_d == [ + { + "url": "http://test_ultimate-sitemap-parser.com/about.html", + "priority": Decimal("0.8"), + "last_modified": datetime.datetime( + 2009, 12, 17, 12, 4, 56, tzinfo=tzoffset(None, 7200) + ), + "change_frequency": "monthly", + "news_story": None, + }, + { + "url": "http://test_ultimate-sitemap-parser.com/contact.html", + "priority": Decimal("0.5"), + "last_modified": datetime.datetime( + 2009, 12, 17, 12, 4, 56, tzinfo=tzoffset(None, 7200) + ), + "change_frequency": "always", + "news_story": None, + }, + { + "url": "http://test_ultimate-sitemap-parser.com/news/foo.html", + "priority": Decimal("0.5"), + "last_modified": None, + "change_frequency": None, + "news_story": { + "title": "Foo ", + "publish_date": datetime.datetime( + 2009, 12, 17, 12, 4, 56, tzinfo=tzoffset(None, 7200) + ), + "publication_name": "Test publication", + "publication_language": "en", + "access": None, + "genres": [], + "keywords": [], + "stock_tickers": [], + }, + }, + { + "url": "http://test_ultimate-sitemap-parser.com/news/bar.html", + "priority": Decimal("0.5"), + "last_modified": None, + "change_frequency": None, + "news_story": { + "title": "Bar & bar", + "publish_date": datetime.datetime( + 2009, 12, 17, 12, 4, 56, tzinfo=tzoffset(None, 7200) + ), + "publication_name": "Test publication", + "publication_language": "en", + "access": None, + "genres": [], + "keywords": [], + "stock_tickers": [], + }, + }, + { + "url": "http://test_ultimate-sitemap-parser.com/news/bar.html", + "priority": Decimal("0.5"), + "last_modified": None, + "change_frequency": None, + "news_story": { + "title": "Bar & bar", + "publish_date": datetime.datetime( + 2009, 12, 17, 12, 4, 56, tzinfo=tzoffset(None, 7200) + ), + "publication_name": "Test publication", + "publication_language": "en", + "access": None, + "genres": [], + "keywords": [], + 
"stock_tickers": [], + }, + }, + { + "url": "http://test_ultimate-sitemap-parser.com/news/baz.html", + "priority": Decimal("0.5"), + "last_modified": None, + "change_frequency": None, + "news_story": { + "title": "Bąž", + "publish_date": datetime.datetime( + 2009, 12, 17, 12, 4, 56, tzinfo=tzoffset(None, 7200) + ), + "publication_name": "Test publication", + "publication_language": "en", + "access": None, + "genres": [], + "keywords": [], + "stock_tickers": [], + }, + }, + ] diff --git a/usp/objects/page.py b/usp/objects/page.py index e1451cc..12e12c8 100644 --- a/usp/objects/page.py +++ b/usp/objects/page.py @@ -91,6 +91,18 @@ def __eq__(self, other) -> bool: return True + def to_dict(self): + return { + "title": self.title, + "publish_date": self.publish_date, + "publication_name": self.publication_name, + "publication_language": self.publication_language, + "access": self.access, + "genres": self.genres, + "keywords": self.keywords, + "stock_tickers": self.stock_tickers, + } + def __hash__(self): return hash( ( @@ -290,6 +302,21 @@ def __repr__(self): ")" ) + def to_dict(self): + """ + Convert this page to a dictionary. + """ + + return { + "url": self.url, + "priority": self.priority, + "last_modified": self.last_modified, + "change_frequency": self.change_frequency.value + if self.change_frequency + else None, + "news_story": self.news_story.to_dict() if self.news_story else None, + } + @property def url(self) -> str: """ diff --git a/usp/objects/sitemap.py b/usp/objects/sitemap.py index 3a2be4d..95ed468 100644 --- a/usp/objects/sitemap.py +++ b/usp/objects/sitemap.py @@ -9,6 +9,7 @@ """ import abc +from functools import cache import os import pickle import tempfile @@ -17,6 +18,29 @@ from .page import SitemapPage +@cache +def _all_slots(target_cls): + mro = target_cls.__mro__ + + # If a child class doesn't declare slots, getattr reports its parents' slots + # So we need to track the highest class that declared each slot + last_slot = {} + + for cls in mro: + attrs = getattr(cls, "__slots__", tuple()) + for attr in attrs: + last_slot[attr] = cls + + slots = set() + for attr, cls in last_slot.items(): + # Attrs belonging to parent classes may be mangled + if cls is not target_cls and attr.startswith("__"): + attr = "_" + cls.__name__ + attr + slots.add(attr) + + return slots + + class AbstractSitemap(metaclass=abc.ABCMeta): """ Abstract sitemap. @@ -58,6 +82,18 @@ def url(self) -> str: """ return self.__url + def to_dict(self, with_pages=True) -> dict: + """ + Return a dictionary representation of the sitemap, including its child sitemaps and optionally pages + + :param with_pages: Include pages in the representation of this sitemap or descendants. + :return: Dictionary representation of the sitemap. 
+ """ + + return { + "url": self.url, + } + @property @abc.abstractmethod def pages(self) -> List[SitemapPage]: @@ -137,6 +173,12 @@ def __repr__(self): ")" ) + def to_dict(self, with_pages=True) -> dict: + return { + **super().to_dict(with_pages=with_pages), + "reason": self.reason, + } + @property def reason(self) -> str: """ @@ -181,8 +223,11 @@ def __init__(self, url: str, pages: List[SitemapPage]): """ super().__init__(url=url) + self._dump_pages(pages) + + def _dump_pages(self, pages: List[SitemapPage]): temp_file, self.__pages_temp_file_path = tempfile.mkstemp() - with os.fdopen(temp_file, "wb") as tmp: + with open(self.__pages_temp_file_path, "wb") as tmp: pickle.dump(pages, tmp, protocol=pickle.HIGHEST_PROTOCOL) def __del__(self): @@ -205,6 +250,32 @@ def __repr__(self): f"{self.__class__.__name__}(" f"url={self.url}, " f"pages={self.pages}" ")" ) + def __getstate__(self) -> tuple[None, dict]: + # Load default slots + obj_slots = {slot: getattr(self, slot) for slot in _all_slots(self.__class__)} + del obj_slots["_AbstractPagesSitemap__pages_temp_file_path"] + obj_slots["_pages_value"] = self.pages + return None, obj_slots + + def __setstate__(self, state: tuple): + _, attrs = state + if "_pages_value" not in attrs: + raise ValueError("State does not contain pages value") + pages_val = attrs.pop("_pages_value") + for slot, val in attrs.items(): + setattr(self, slot, val) + self._dump_pages(pages_val) + + def to_dict(self, with_pages=True) -> dict: + obj = { + **super().to_dict(with_pages=with_pages), + } + + if with_pages: + obj["pages"] = [page.to_dict() for page in self.pages] + + return obj + @property def pages(self) -> List[SitemapPage]: """ @@ -297,6 +368,15 @@ def __repr__(self): ")" ) + def to_dict(self, with_pages=True) -> dict: + return { + **super().to_dict(with_pages=with_pages), + "sub_sitemaps": [ + sub_sitemap.to_dict(with_pages=with_pages) + for sub_sitemap in self.sub_sitemaps + ], + } + @property def sub_sitemaps(self) -> List["AbstractSitemap"]: return self.__sub_sitemaps From 3ebbe68a59e26fb966979ed12b060d584fa8430a Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 3 Sep 2024 10:58:03 +0100 Subject: [PATCH 44/79] Improve in-code docs --- usp/exceptions.py | 6 ++- usp/fetch_parse.py | 63 +++++++++++++++++++++++++++++-- usp/helpers.py | 3 ++ usp/objects/page.py | 7 +++- usp/objects/sitemap.py | 6 ++- usp/tree.py | 3 +- usp/web_client/abstract_client.py | 2 +- usp/web_client/requests_client.py | 20 +++++++--- 8 files changed, 93 insertions(+), 17 deletions(-) diff --git a/usp/exceptions.py b/usp/exceptions.py index 88546cf..014466f 100644 --- a/usp/exceptions.py +++ b/usp/exceptions.py @@ -19,7 +19,8 @@ class SitemapXMLParsingException(Exception): class GunzipException(Exception): """ - gunzip() exception. + Error decompressing seemingly gzipped content. + See :func:`usp.helpers.gunzip`. """ pass @@ -27,7 +28,8 @@ class GunzipException(Exception): class StripURLToHomepageException(Exception): """ - strip_url_to_homepage() exception. + Problem parsing URL and stripping to homepage. + See :func:`usp.helpers.strip_url_to_homepage`. """ pass diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index a0fdeea..8965e03 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -51,7 +51,9 @@ class SitemapFetcher: - """robots.txt / XML / plain text sitemap fetcher.""" + """ + Fetches and parses the sitemap at a given URL, and any declared sub-sitemaps. + """ __MAX_SITEMAP_SIZE = 100 * 1024 * 1024 """Max. uncompressed sitemap size. 
@@ -73,6 +75,15 @@ def __init__( recursion_level: int, web_client: Optional[AbstractWebClient] = None, ): + """ + + :param url: URL of the sitemap to fetch and parse. + :param recursion_level: current recursion level of parser + :param web_client: Web client to use. If ``None``, a :class:`~.RequestsWebClient` will be used. + + :raises SitemapException: If the maximum recursion depth is exceeded. + :raises SitemapException: If the URL is not an HTTP(S) URL + """ if recursion_level > self.__MAX_RECURSION_LEVEL: raise SitemapException( f"Recursion level exceeded {self.__MAX_RECURSION_LEVEL} for URL {url}." @@ -91,6 +102,12 @@ def __init__( self._recursion_level = recursion_level def sitemap(self) -> AbstractSitemap: + """ + Fetch and parse the sitemap. + + :return: the parsed sitemap. Will be a child of :class:`~.AbstractSitemap`. + If an HTTP error is encountered, or the sitemap cannot be parsed, will be :class:`~.InvalidSitemap`. + """ log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...") response = get_url_retry_on_client_errors( url=self._url, web_client=self._web_client @@ -163,6 +180,11 @@ def __init__( @abc.abstractmethod def sitemap(self) -> AbstractSitemap: + """ + Create the parsed sitemap instance and perform any sub-parsing needed. + + :return: an instance of the appropriate sitemap class + """ raise NotImplementedError("Abstract method.") @@ -255,7 +277,11 @@ def sitemap(self) -> AbstractSitemap: class XMLSitemapParser(AbstractSitemapParser): - """XML sitemap parser.""" + """Initial XML sitemap parser. + + Instantiates an Expat parser and registers handler methods, which determine the specific format + and instantiates a concrete parser (inheriting from :class:`AbstractXMLSitemapParser`) to extract data. + """ __XML_NAMESPACE_SEPARATOR = " " @@ -417,17 +443,39 @@ def __init__(self, url: str): self._last_handler_call_was_xml_char_data = False def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: + """Concrete parser handler when the start of an element is encountered. + + See :external+python:meth:`xmlparser.StartElementHandler ` + + :param name: element name, potentially prefixed with namespace + :param attrs: element attributes + """ self._last_handler_call_was_xml_char_data = False pass def xml_element_end(self, name: str) -> None: + """Concrete parser handler when the end of an element is encountered. + + See :external+python:meth:`xmlparser.EndElementHandler ` + + :param name: element name, potentially prefixed with namespace + """ # End of any element always resets last encountered character data self._last_char_data = "" self._last_handler_call_was_xml_char_data = False def xml_char_data(self, data: str) -> None: - # Handler might be called multiple times for what essentially is a single string, e.g. in case of entities - # ("ABC & DEF"), so this is why we're appending + """ + Concrete parser handler for character data. + + Multiple concurrent calls are concatenated until an XML element start or end is reached, + as it may be called multiple times for a single string. + E.g. ``ABC & DEF``. + + See :external+python:meth:`xmlparser.CharacterDataHandler ` + + :param data: string data + """ if self._last_handler_call_was_xml_char_data: self._last_char_data += data else: @@ -437,6 +485,11 @@ def xml_char_data(self, data: str) -> None: @abc.abstractmethod def sitemap(self) -> AbstractSitemap: + """ + Create the parsed sitemap instance and perform any sub-parsing needed. 
+ + :return: an instance of the appropriate sitemap class + """ raise NotImplementedError("Abstract method.") @@ -870,6 +923,8 @@ class PagesAtomSitemapParser(AbstractXMLSitemapParser): """ Pages Atom 0.3 / 1.0 sitemap parser. + References: + - https://github.com/simplepie/simplepie-ng/wiki/Spec:-Atom-0.3 - https://www.ietf.org/rfc/rfc4287.txt - http://rakaz.nl/2005/07/moving-from-atom-03-to-10.html diff --git a/usp/helpers.py b/usp/helpers.py index fcf0b22..6a610de 100644 --- a/usp/helpers.py +++ b/usp/helpers.py @@ -196,6 +196,7 @@ def gunzip(data: bytes) -> bytes: """ Gunzip data. + :raises GunzipException: If the data cannot be decompressed. :param data: Gzipped data. :return: Gunzipped data. """ @@ -259,6 +260,8 @@ def strip_url_to_homepage(url: str) -> str: """ Strip URL to its homepage. + :raises StripURLToHomepageException: If URL is empty or cannot be parsed. + :param url: URL to strip, e.g. "http://www.example.com/page.html". :return: Stripped homepage URL, e.g. "http://www.example.com/" """ diff --git a/usp/objects/page.py b/usp/objects/page.py index 12e12c8..4fb0f67 100644 --- a/usp/objects/page.py +++ b/usp/objects/page.py @@ -91,7 +91,12 @@ def __eq__(self, other) -> bool: return True - def to_dict(self): + def to_dict(self) -> dict: + """ + Convert to a dictionary representation. + + :return: the news story data as a dictionary + """ return { "title": self.title, "publish_date": self.publish_date, diff --git a/usp/objects/sitemap.py b/usp/objects/sitemap.py index 95ed468..fd679c3 100644 --- a/usp/objects/sitemap.py +++ b/usp/objects/sitemap.py @@ -251,14 +251,16 @@ def __repr__(self): ) def __getstate__(self) -> tuple[None, dict]: - # Load default slots + # Load slots of this class and its parents (mangling if appropriate) obj_slots = {slot: getattr(self, slot) for slot in _all_slots(self.__class__)} + # Replace temp file path with actual content del obj_slots["_AbstractPagesSitemap__pages_temp_file_path"] obj_slots["_pages_value"] = self.pages return None, obj_slots def __setstate__(self, state: tuple): _, attrs = state + # We can't restore contents without this key if "_pages_value" not in attrs: raise ValueError("State does not contain pages value") pages_val = attrs.pop("_pages_value") @@ -296,7 +298,7 @@ def sub_sitemaps(self) -> List["AbstractSitemap"]: """ return [] - +# TODO: declare empty __slots__ class PagesXMLSitemap(AbstractPagesSitemap): """ XML sitemap that contains URLs to pages. diff --git a/usp/tree.py b/usp/tree.py index 92f39ce..2045e01 100644 --- a/usp/tree.py +++ b/usp/tree.py @@ -45,7 +45,8 @@ def sitemap_tree_for_homepage( Using a homepage URL, fetch the tree of sitemaps and pages listed in them. :param homepage_url: Homepage URL of a website to fetch the sitemap tree for, e.g. "http://www.example.com/". - :param web_client: Web client implementation to use for fetching sitemaps. + :param web_client: Custom web client implementation to use when fetching sitemaps. + If ``None``, a :class:`~.RequestsWebClient` will be used. :param use_robots: Whether to discover sitemaps through robots.txt. :param use_known_paths: Whether to discover sitemaps through common known paths. :return: Root sitemap object of the fetched sitemap tree. 
diff --git a/usp/web_client/abstract_client.py b/usp/web_client/abstract_client.py index 54299c1..9b2cc7e 100644 --- a/usp/web_client/abstract_client.py +++ b/usp/web_client/abstract_client.py @@ -155,7 +155,7 @@ def set_max_response_data_length(self, max_response_data_length: int) -> None: @abc.abstractmethod def get(self, url: str) -> AbstractWebClientResponse: """ - Fetch an URL and return a response. + Fetch a URL and return a response. Method shouldn't throw exceptions on connection errors (including timeouts); instead, such errors should be reported via Response object. diff --git a/usp/web_client/requests_client.py b/usp/web_client/requests_client.py index c0b3696..9a9e97d 100644 --- a/usp/web_client/requests_client.py +++ b/usp/web_client/requests_client.py @@ -1,4 +1,4 @@ -"""requests-based implementation of web client class.""" +"""Implementation of :mod:`usp.web_client.abstract_client` with Requests.""" from http import HTTPStatus from typing import Optional, Dict @@ -30,6 +30,10 @@ def __init__( requests_response: requests.Response, max_response_data_length: Optional[int] = None, ): + """ + :param requests_response: Response data + :param max_response_data_length: Maximum data length, or ``None`` to not restrict. + """ self.__requests_response = requests_response self.__max_response_data_length = max_response_data_length @@ -56,7 +60,7 @@ def raw_data(self) -> bytes: class RequestsWebClientErrorResponse(WebClientErrorResponse): """ - requests-based error response. + Error response from the Requests parser. """ pass @@ -78,9 +82,13 @@ class RequestsWebClient(AbstractWebClient): "__max_response_data_length", "__timeout", "__proxies", + "__verify" ] def __init__(self, verify=True): + """ + :param verify: whether certificates should be verified for HTTPS requests. + """ self.__max_response_data_length = None self.__timeout = self.__HTTP_REQUEST_TIMEOUT self.__proxies = {} @@ -93,19 +101,19 @@ def set_timeout(self, timeout: int) -> None: def set_proxies(self, proxies: Dict[str, str]) -> None: """ - Set proxies from dictionnary where: + Set a proxy for the request. * keys are schemes, e.g. "http" or "https"; * values are "scheme://user:password@host:port/". - For example: - - proxies = {'http': 'http://user:pass@10.10.1.10:3128/'} + :param proxies: Proxy definition where the keys are schemes ("http" or "https") and values are the proxy address. 
+ Example: ``{'http': 'http://user:pass@10.10.1.10:3128/'}`` """ # Used mostly for testing self.__proxies = proxies def set_max_response_data_length(self, max_response_data_length: int) -> None: + """Set max response data length.""" self.__max_response_data_length = max_response_data_length def get(self, url: str) -> AbstractWebClientResponse: From 97978b3432628894d3b6db4340e728b9b35c02a2 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 3 Sep 2024 10:58:54 +0100 Subject: [PATCH 45/79] Ruff --- usp/objects/sitemap.py | 1 + usp/web_client/requests_client.py | 7 +------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/usp/objects/sitemap.py b/usp/objects/sitemap.py index fd679c3..97cd038 100644 --- a/usp/objects/sitemap.py +++ b/usp/objects/sitemap.py @@ -298,6 +298,7 @@ def sub_sitemaps(self) -> List["AbstractSitemap"]: """ return [] + # TODO: declare empty __slots__ class PagesXMLSitemap(AbstractPagesSitemap): """ diff --git a/usp/web_client/requests_client.py b/usp/web_client/requests_client.py index 9a9e97d..5a9121c 100644 --- a/usp/web_client/requests_client.py +++ b/usp/web_client/requests_client.py @@ -78,12 +78,7 @@ class RequestsWebClient(AbstractWebClient): Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big. """ - __slots__ = [ - "__max_response_data_length", - "__timeout", - "__proxies", - "__verify" - ] + __slots__ = ["__max_response_data_length", "__timeout", "__proxies", "__verify"] def __init__(self, verify=True): """ From 009cb37aca4c7cf081c2b5e3ffa4bbf4c1b28487 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 3 Sep 2024 12:06:57 +0100 Subject: [PATCH 46/79] Enhanced docs --- docs/Makefile | 14 +- docs/_static/css/custom.css | 82 ++ docs/acknowledgements.rst | 11 + docs/changelog.rst | 28 + docs/conf.py | 205 ++--- docs/extensions/__init__.py | 0 docs/extensions/custom_graphviz.py | 613 +++++++++++++ docs/get-started.rst | 38 + docs/guides/_sitemap_examples/atom0.3.xml | 18 + docs/guides/_sitemap_examples/atom1.0.xml | 18 + docs/guides/_sitemap_examples/bbc-sitemap.dot | 365 ++++++++ docs/guides/_sitemap_examples/class-tree.dot | 4 + docs/guides/_sitemap_examples/google-news.xml | 29 + docs/guides/_sitemap_examples/plaintext.txt | 4 + docs/guides/_sitemap_examples/rss2.0.xml | 18 + .../guides/_sitemap_examples/simple-index.xml | 11 + .../_sitemap_examples/simple-urlset.xml | 11 + docs/guides/fetch-parse.rst | 56 ++ docs/guides/parse_flow.dot | 142 +++ docs/guides/performance.rst | 55 ++ docs/guides/saving.rst | 75 ++ docs/guides/security.rst | 20 + docs/guides/sitemap-tree.rst | 170 ++++ docs/index.rst | 92 +- docs/modules.rst | 7 - docs/reference/api/index.rst | 13 + docs/reference/api/usp.exceptions.rst | 5 + docs/reference/api/usp.fetch_parse.rst | 42 + docs/reference/api/usp.helpers.rst | 5 + docs/reference/api/usp.objects.page.rst | 14 + docs/reference/api/usp.objects.rst | 8 + docs/reference/api/usp.objects.sitemap.rst | 57 ++ docs/reference/api/usp.tree.rst | 7 + .../api/usp.web_client.abstract_client.rst | 21 + .../api/usp.web_client.requests_client.rst | 19 + docs/reference/api/usp.web_client.rst | 6 + docs/reference/cli.rst | 69 ++ docs/reference/formats.rst | 227 +++++ docs/reference/formats_examples/atom0.3.xml | 18 + docs/reference/formats_examples/atom1.0.xml | 18 + .../formats_examples/google-news.xml | 28 + docs/reference/formats_examples/plaintext.txt | 4 + docs/reference/formats_examples/robots.txt | 13 + docs/reference/formats_examples/rss2.0.xml | 18 + 
.../formats_examples/simple-index.xml | 11 + .../formats_examples/simple-urlset.xml | 11 + docs/usp.objects.rst | 30 - docs/usp.rst | 38 - docs/usp.web_client.rst | 30 - poetry.lock | 868 +++++++++++++++--- pyproject.toml | 10 + 51 files changed, 3285 insertions(+), 391 deletions(-) create mode 100644 docs/_static/css/custom.css create mode 100644 docs/acknowledgements.rst create mode 100644 docs/changelog.rst create mode 100644 docs/extensions/__init__.py create mode 100644 docs/extensions/custom_graphviz.py create mode 100644 docs/get-started.rst create mode 100644 docs/guides/_sitemap_examples/atom0.3.xml create mode 100644 docs/guides/_sitemap_examples/atom1.0.xml create mode 100644 docs/guides/_sitemap_examples/bbc-sitemap.dot create mode 100644 docs/guides/_sitemap_examples/class-tree.dot create mode 100644 docs/guides/_sitemap_examples/google-news.xml create mode 100644 docs/guides/_sitemap_examples/plaintext.txt create mode 100644 docs/guides/_sitemap_examples/rss2.0.xml create mode 100644 docs/guides/_sitemap_examples/simple-index.xml create mode 100644 docs/guides/_sitemap_examples/simple-urlset.xml create mode 100644 docs/guides/fetch-parse.rst create mode 100644 docs/guides/parse_flow.dot create mode 100644 docs/guides/performance.rst create mode 100644 docs/guides/saving.rst create mode 100644 docs/guides/security.rst create mode 100644 docs/guides/sitemap-tree.rst delete mode 100644 docs/modules.rst create mode 100644 docs/reference/api/index.rst create mode 100644 docs/reference/api/usp.exceptions.rst create mode 100644 docs/reference/api/usp.fetch_parse.rst create mode 100644 docs/reference/api/usp.helpers.rst create mode 100644 docs/reference/api/usp.objects.page.rst create mode 100644 docs/reference/api/usp.objects.rst create mode 100644 docs/reference/api/usp.objects.sitemap.rst create mode 100644 docs/reference/api/usp.tree.rst create mode 100644 docs/reference/api/usp.web_client.abstract_client.rst create mode 100644 docs/reference/api/usp.web_client.requests_client.rst create mode 100644 docs/reference/api/usp.web_client.rst create mode 100644 docs/reference/cli.rst create mode 100644 docs/reference/formats.rst create mode 100644 docs/reference/formats_examples/atom0.3.xml create mode 100644 docs/reference/formats_examples/atom1.0.xml create mode 100644 docs/reference/formats_examples/google-news.xml create mode 100644 docs/reference/formats_examples/plaintext.txt create mode 100644 docs/reference/formats_examples/robots.txt create mode 100644 docs/reference/formats_examples/rss2.0.xml create mode 100644 docs/reference/formats_examples/simple-index.xml create mode 100644 docs/reference/formats_examples/simple-urlset.xml delete mode 100644 docs/usp.objects.rst delete mode 100644 docs/usp.rst delete mode 100644 docs/usp.web_client.rst diff --git a/docs/Makefile b/docs/Makefile index 298ea9e..4b3e11b 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,12 +1,17 @@ # Minimal makefile for Sphinx documentation # -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build +.PHONY: livehtml +livehtml: + sphinx-autobuild -a "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + # Put it first so that "make" without argument is like "make help". 
help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) @@ -16,4 +21,5 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css new file mode 100644 index 0000000..cef1a5b --- /dev/null +++ b/docs/_static/css/custom.css @@ -0,0 +1,82 @@ +/* Fix graphviz width */ +svg.graphviz { + max-width: 100% !important; + height: auto !important; +} + +svg.graphviz .node a { + text-decoration: none; + fill: var(--color-link); +} + +svg.graphviz .node a:hover { + fill: var(--pst-color-link-hover); + text-decoration: underline; +} + +/* Make Rubric more like sphinx-book-theme */ + +p.rubric { + font-size: 1em; + border-bottom: 1px solid var(--color-background-border); + margin-top: 30px; + line-height: unset; +} + +/* Flush code blocks in cards */ +.code-card .sd-card-body { + padding: 0; + border-radius: 0 0 0.25rem 0.25rem; + background-color: #f0f0f0 !important; +} + +.code-card .sd-card-body>div { + margin: 0; + border-radius: 0 0 0.25rem 0.25rem; +} + +.code-card .highlight { + border-radius: 0 0 0.25rem 0.25rem; + background: unset !important; +} + +.code-card .highlight pre { + border-top-left-radius: 0; + border-top-right-radius: 0; + border: none; + background: unset; +} + +/* Flush code blocks in dropdowns */ + +details.flush { + --sd-fontsize-tabs-label: 0.9rem; +} + +details.flush .sd-summary-content { + padding: 0; +} + +details.flush .sd-summary-content > div[class^="highlight-"] { + margin: 0; +} + +details.flush .highlight { + border-radius: 0 0 0.25rem 0.25rem; +} + +details.flush .sd-tab-set { + margin-bottom: 0; +} + +details.flush .sd-tab-content { + padding: 0; +} + +details.flush .sd-tab-label { + padding-top: 0.5em; +} + +.extlink-commit { + font-family: var(--font-stack--monospace); +} \ No newline at end of file diff --git a/docs/acknowledgements.rst b/docs/acknowledgements.rst new file mode 100644 index 0000000..cb39e40 --- /dev/null +++ b/docs/acknowledgements.rst @@ -0,0 +1,11 @@ +Acknowledgements +================ + +Ultimate Sitemap Parser was originally developed by `Linus Valiukas `_ and `Thomas Grandjean `_ as part of the `Media Cloud `_ project, incubated at the Berkman Klein Center for Internet and Society at Harvard University and the MIT Media Lab in the Centre for Civic Media. + +It is now maintained by the `GATE Team `_ at the School of Computer Science, University of Sheffield, chiefly by `Freddy Heppell `_. Development is partially supported by: + +- InnovateUK grant number 10039039 (approved under the Horizon Europe Programme as VIGILANT, EU grant agreement number 101073921) +- A University of Sheffield Faculty of Engineering PGR Prize Scholarship + +Other contributors are listed in the `GitHub contributors list `_. 
\ No newline at end of file diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 0000000..f56ca85 --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1,28 @@ +Changelog +========= + +v1.0.0 (upcoming) +----------------- + +- TODO + +v0.6 (upcoming) +--------------- + +- Add proxy support with :meth:`.RequestsWebClient.set_proxies` (:pr:`20` by :user:`tgrandje`) +- Add additional sitemap discovery paths for news sitemaps (:commit:`d3bdaae56be87c97ce2f3f845087f495f6439b44`) +- Resolve warnings caused by :external+python:class:`http.HTTPStatus` usage (:commit:`3867b6e`) +- Don't add :class:`~.InvalidSitemap` object if ``robots.txt`` is not found (:pr:`39` by :user:`gbenson`) +- Add parameter to :meth:`~.RequestsWebClient.__init__` to disable certificate verification (:pr:`37` by :user:`japherwocky`) +- Remove log configuration so it can be specified at application level (:pr:`24` by :user:`dsoprea`) + + +Prior versions +-------------- + +For versions prior to 1.0, no changelog is available. Use the release tags to compare versions: + +- `0.4...0.5 `__ +- `0.3...0.4 `__ +- `0.2...0.3 `__ +- `0.1...0.2 `__ diff --git a/docs/conf.py b/docs/conf.py index 9be76b9..c46cff7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,186 +1,91 @@ -# -*- coding: utf-8 -*- -# # Configuration file for the Sphinx documentation builder. # -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html -from pathlib import Path -project_path = Path(__file__).absolute().parent.joinpath('..') +import sys, os -import sys -sys.path.insert(0, project_path.as_posix()) +sys.path.append(os.path.abspath('extensions')) -from usp.__about__ import __version__ # -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'Ultimate Sitemap Parser' -copyright = '2018, Linas Valiukas, Hal Roberts, Media Cloud project' -author = 'Linas Valiukas, Hal Roberts, Media Cloud project' -# The short X.Y version -version = __version__ -# The full version, including alpha/beta/rc tags -release = version +project = 'Ultimate Sitemap Parser' +copyright = '2018-2024, Ultimate Sitemap Parser Contributors' +author = 'Ultimate Sitemap Parser Contributors' +release = '0.5.0' # -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. 
extensions = [ 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.viewcode', + 'sphinx.ext.autosummary', + 'sphinx.ext.inheritance_diagram', + 'sphinx.ext.intersphinx', + 'sphinx.ext.coverage', + 'sphinx.ext.extlinks', + 'sphinx_design', + 'sphinxext.opengraph', + 'sphinx_copybutton', + 'custom_graphviz', + ] -# Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = None +modindex_common_prefix = ['usp.'] +nitpicky = True # -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'alabaster' +_gh_root = 'https://github.com/GateNLP/ultimate-sitemap-parser' -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". +html_theme = 'furo' html_static_path = ['_static'] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = 'UltimateSitemapParserdoc' - - -# -- Options for LaTeX output ------------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). 
-latex_documents = [ - (master_doc, 'UltimateSitemapParser.tex', 'Ultimate Sitemap Parser Documentation', - 'Linas Valiukas, Hal Roberts, Media Cloud project', 'manual'), -] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'ultimatesitemapparser', 'Ultimate Sitemap Parser Documentation', - [author], 1) +html_title = 'Ultimate Sitemap Parser' +html_css_files = [ + 'css/custom.css', ] +html_theme_options = { + 'source_repository': _gh_root, + 'source_branch': 'master', + 'source_directory': 'docs/' +} +# -- Extension Config -------------------------------------------------------- -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'UltimateSitemapParser', 'Ultimate Sitemap Parser Documentation', - author, 'UltimateSitemapParser', 'One line description of project.', - 'Miscellaneous'), -] +autodoc_class_signature = 'separated' +autodoc_member_order = 'groupwise' +extlinks = { + 'issue': (f'{_gh_root}/issues/%s', '#%s'), + 'pr': (f'{_gh_root}/pull/%s', '#%s'), + 'user': (f'https://github.com/%s', '@%s'), + 'commit': (f'{_gh_root}/commit/%s', '%.7s'), +} -# -- Options for Epub output ------------------------------------------------- - -# Bibliographic Dublin Core info. -epub_title = project +graphviz_output_format = 'svg' -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -# -# epub_identifier = '' +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', None), + 'dateutil': ('https://dateutil.readthedocs.io/en/stable', None), + 'requests': ('https://requests.readthedocs.io/en/latest', None) +} -# A unique identification for the text. -# -# epub_uid = '' +autosectionlabel_prefix_document = True +autosectionlabel_maxdepth = 3 -# A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +pygments_style = "friendly" +coverage_show_missing_items = True -# -- Extension configuration ------------------------------------------------- +copybutton_exclude = '.linenos' +copybutton_prompt_text = r">>> |\.\.\. 
|\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " +copybutton_prompt_is_regexp = True \ No newline at end of file diff --git a/docs/extensions/__init__.py b/docs/extensions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docs/extensions/custom_graphviz.py b/docs/extensions/custom_graphviz.py new file mode 100644 index 0000000..6f084fe --- /dev/null +++ b/docs/extensions/custom_graphviz.py @@ -0,0 +1,613 @@ +""" +Derived from sphinx_immaterial.graphviz +https://github.com/jbms/sphinx-immaterial/blob/3c8fe16a499407a9a9b71b7dd2133c559cdccf95/sphinx_immaterial/graphviz.py +and sphinx_immaterial.sphinx_utils +https://github.com/jbms/sphinx-immaterial/blob/main/sphinx_immaterial/sphinx_utils.py + +Copyright 2021 The Sphinx-Immaterial Authors +SPDX-License-Identifier: MIT + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished +to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +import html +import io +import os +import pathlib +import re +import subprocess +import tempfile +from typing import Optional, Any, Tuple, List, Dict, Type, Sequence, NamedTuple, Union +import xml.etree.ElementTree as ET + +import docutils.nodes +import sphinx.application +import sphinx.ext.graphviz +import sphinx.util.docutils +import sphinx.util.logging +from sphinx.writers.html import HTMLTranslator +from sphinx.writers.html5 import HTML5Translator +import docutils.parsers.rst.roles +import docutils.parsers.rst.states +import docutils.statemachine + + +logger = sphinx.util.logging.getLogger(__name__) + +def to_statemachine_stringlist( + content: str, source_path: str, source_line: int = 0 +) -> docutils.statemachine.StringList: + """Converts to a docutils StringList with associated source info. + + All lines of `content` are assigned the same source info. + + :param content: Source text. + :param source_path: Path to the source file, for error messages. + :param source_line: Line number in source file, for error messages. + + :returns: The string list, which may be passed to `nested_parse`. 
+ """ + list_lines = docutils.statemachine.string2lines(content) + items = [(source_path, source_line)] * len(list_lines) + return docutils.statemachine.StringList(list_lines, items=items) + + +def parse_rst( + state: docutils.parsers.rst.states.RSTState, + text: Union[str, docutils.statemachine.StringList], + source_path: str = "", + source_line: int = 0, +) -> List[docutils.nodes.Node]: + content = ( + to_statemachine_stringlist(text, source_path, source_line) + if isinstance(text, str) + else text + ) + with sphinx.util.docutils.switch_source_input(state, content): + node = docutils.nodes.container() + # necessary so that the child nodes get the right source/line set + node.document = state.document + state.nested_parse(content, 0, node) + return node.children + +def _replace_resolved_xrefs(node: sphinx.ext.graphviz.graphviz, code: str) -> str: + """Extracts any resolved references, and uses them to replace `xref` + attributes in the DOT code. + """ + ref_replacements = {} + + for child in node.children: + if not isinstance(child, docutils.nodes.container): + continue + xref_id = child.get("xref_id") + if xref_id is None: + continue + text = child.astext() + ref_nodes = list(child.findall(condition=docutils.nodes.reference)) + + title = None + url = None + target = None + replacement_text = f"label=<{html.escape(text)}>" + if ref_nodes: + ref_node = ref_nodes[-1] + refuri = ref_node.get("refuri") + if refuri is not None: + url = refuri or "#" + else: + url = "#" + ref_node["refid"] + title = ref_node.get("reftitle") + replacement_text += f' href="{url}"' + if title is not None: + replacement_text += f" tooltip=<{html.escape(title)}>" + target = ref_node.get("target") + if target is not None: + replacement_text += f' target="{target}"' + + ref_replacements[xref_id] = replacement_text + + if ref_replacements: + ref_pattern = "|".join(ref_replacements.keys()) + code = re.sub(ref_pattern, lambda m: ref_replacements[m.group(0)], code) + return code + + +class GraphvizConfigInfo(NamedTuple): + """Adjusted graphviz configuration. + + This extension configures Graphviz to use use the same font (from Google + Fonts) that is used for other document text. + + Note that when generating SVG output, Graphviz uses the font just to compute + the size of labels. The web browser is still responsible for loading the + font and rendering the text. Therefore, the font needs to be available both + to Graphviz and the web browser. + + Since the font already needs to be available to the web browser for other + document text, there is no added complication there. But making the font + available to Graphviz is non-trivial: + + - Graphviz supports several mechanisms for text layout: + + - LibGD, always available on Linux, sometimes on Windows, and allows TTF + font paths to be specified directly; + + - Pango and/or GDI+ (Windows only), relies on systems-specific font paths + and does not allow paths to TTF fonts to be specified directly. + + Therefore, LibGD must be used, but unfortunately, depending on how + Graphviz is built, Pango and/or GDI+ is normally used by default. + + - Graphviz does not provide any command-line options for controlling which + plugins are loaded. Instead, plugins can only be loaded through a config + file. + + Therefore, to ensure LibGD is used for fonts, it is necessary to locate the + original config file, parse it and remove the section that loads Pango, if + present, and write it to a new temporary directory. 
+ + """ + + orig_config_path: str + """Path to original config file.""" + + new_config: bytes + """New config content with pango excluded.""" + + +def get_adjusted_graphviz_config( + app: sphinx.application.Sphinx, dot_command: str +) -> Optional[GraphvizConfigInfo]: + """Returns the graphviz configuration info for a given `dot_command`. + + The returned config file must be written to a temporary directory with a + symlink "plugins" to the `orig_config_path`. + """ + key = "_sphinx_immaterial_graphviz_adjusted_configs" + configs = getattr(app, key, None) + if configs is None: + configs = {} + setattr(app, key, configs) + config = configs.get(dot_command, False) + if config is False: + config = _make_adjusted_graphviz_config(app, dot_command) + configs[dot_command] = config + return config + + +def _get_orig_config_path(dot_command: str) -> Optional[str]: + result = subprocess.run( + [dot_command, "-v"], input="", text=True, capture_output=True, check=True + ) + + m = re.search( + r"^The plugin configuration file:\s+(.*)$\s+was successfully loaded\.$", + result.stderr, + re.MULTILINE, + ) + if m is None: + logger.error( + "Failed to determine graphviz config path from stderr: %r", result.stderr + ) + return None + + return m.group(1) + + +def _make_adjusted_graphviz_config( + app: sphinx.application.Sphinx, dot_command: str +) -> Optional[GraphvizConfigInfo]: + """Determines the graphviz libdir and generates an adjusted config. + + This is called by `get_adjusted_graphviz_config`. + """ + + orig_config_path = _get_orig_config_path(dot_command) + if orig_config_path is None: + return None + + config_content = pathlib.Path(orig_config_path).read_bytes() + + # Strip comments + config_content = re.sub(b"#[^\n]*", b"", config_content) + + new_config = io.BytesIO() + + prev_index = 0 + + def parse_error(): + logger.error( + "Failed to parse graphviz config file %r, starting at: %r", + orig_config_path, + config_content[prev_index:], + ) + + found_gd = False + + # Match plugins + for m in re.finditer( + rb"\s*([^\s{}]+)\s+([^\s{}]+)\s*(\{\s*(?:[^{}\s]+\s*\{[^{}]*\}\s*)*\s*\})\s*", + config_content, + ): + if m.start() != prev_index: + parse_error() + return None + prev_index = m.end() + plugin_path = m.group(1) + plugin_name = m.group(2) + plugin_config = m.group(3) + if plugin_name == b"gd": + found_gd = True + else: + plugin_config = re.sub(rb"\btextlayout\s*\{[^}]*\}", b"", plugin_config) + if not os.path.isabs(plugin_path): + plugin_path = os.path.join(b"plugins", plugin_path) + new_config.write(plugin_path) + new_config.write(b" ") + new_config.write(plugin_name) + new_config.write(b" ") + new_config.write(plugin_config) + + if prev_index != len(config_content): + parse_error() + return None + + if not found_gd: + if not app.config.graphviz_ignore_incorrect_font_metrics: + logger.warning( + "Incorrect font metrics will be used because " + "graphviz binary %r does not have LibGD support. This warning is expected on x86_64 Windows " + "(https://gitlab.com/graphviz/graphviz/-/issues/2267). 
" + "Set `graphviz_ignore_incorrect_font_metrics = True` in `conf.py` " + "to silence this warning.", + dot_command, + ) + return None + + return GraphvizConfigInfo( + orig_config_path=orig_config_path, new_config=new_config.getvalue() + ) + + +def render_dot_html( + self: Union[HTMLTranslator, HTML5Translator], + node: sphinx.ext.graphviz.graphviz, + code: str, + options: dict, + prefix: str = "graphviz", + imgcls: Optional[str] = None, + alt: Optional[str] = None, + filename: Optional[str] = None, +) -> Tuple[str, str]: + theme_options = self.builder.config["html_theme_options"] + # font: Optional[str] = None + # if isinstance(theme_options["font"], dict) and "text" in theme_options["font"]: + # # using a google font; otherwise + # font = theme_options["font"]["text"] + # else: + # raise ValueError("Font not set") + # ttf_font_paths = google_fonts.get_ttf_font_paths(self.builder.app) + # ttf_font: Optional[str] = None + # if ttf_font_paths and font is not None: + # try: + # # can only support the chosen font if cache exists and a Google font is used + # ttf_font = ttf_font_paths[(font, "400")] + # except KeyError as exc: + # # weight `400` might not exist for the specified font + # all_font_keys = [i for i in ttf_font_paths.keys() if i[0] == font] + # if not all_font_keys: + # raise FileNotFoundError( + # f"Font file for {font} could not be found in cache" + # ) from exc + # # just use first weight for the specified font + # ttf_font = ttf_font_paths[all_font_keys[0]] + + code = _replace_resolved_xrefs(node, code) + + var_replacements: Dict[str, str] = {} + replacement_to_var: Dict[str, str] = {} + + def replace_var(var_text: str) -> str: + replacement_color = var_replacements.setdefault( + var_text, "#%06x" % (0x123456 + len(var_replacements)) + ) + replacement_to_var.setdefault(replacement_color, var_text) + return replacement_color + + def replace_var_in_code(m: re.Match) -> str: + var_text = m.group(1) + replacement_color = replace_var(var_text) + return f'"{replacement_color}"' + + # fontcolor = replace_var("var(--pst-color-link)") + # fontsize = "12" + + graphviz_dot = options.get("graphviz_dot", self.builder.config.graphviz_dot) + config_info = get_adjusted_graphviz_config(self.builder.app, graphviz_dot) + + # if config_info is None: + # ttf_font = font + + command_line_options = [ + # "-Ncolor=" + replace_var("var(--md-graphviz-node-fg-color)"), + # "-Nstyle=solid,filled", + # "-Nfillcolor=" + replace_var("var(--md-graphviz-node-bg-color)"), + # "-Nfontcolor=" + fontcolor, + # "-Nfontsize=" + fontsize, + # "-Ecolor=" + replace_var("var(--md-graphviz-edge-color)"), + # "-Efontcolor=" + fontcolor, + # "-Efontsize=" + fontsize, + # "-Gbgcolor=transparent", + # "-Gcolor=" + replace_var("var(--md-graphviz-node-fg-color)"), + # "-Gfontcolor=" + fontcolor, + # "-Gfontsize=" + fontsize, + ] + # if ttf_font is not None: + # command_line_options.extend( + # [ + # "-Nfontname=" + ttf_font, + # "-Efontname=" + ttf_font, + # "-Gfontname=" + ttf_font, + # ] + # ) + + code = re.sub(r'"((?:var|calc)\s*\(.*?\))"', replace_var_in_code, code) + + dot_cmd = [graphviz_dot] + dot_cmd.extend(command_line_options) + dot_cmd.extend(self.builder.config.graphviz_dot_args) + dot_cmd.append("-Tsvg") + + with tempfile.TemporaryDirectory() as tempdir: + env = os.environ.copy() + if config_info is not None: + orig_lib_path = pathlib.Path(config_info.orig_config_path) + new_lib_dir = pathlib.Path(tempdir, "plugins") + pathlib.Path(tempdir, orig_lib_path.name).write_bytes( + config_info.new_config + ) + + 
env["GVBINDIR"] = tempdir + new_lib_dir.symlink_to(orig_lib_path.parent, target_is_directory=True) + cwd = str(orig_lib_path.parent) + else: + cwd = None + dot_result = subprocess.run( + dot_cmd, + input=code, + encoding="utf-8", + check=False, + env=env, + cwd=cwd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + svg_output = dot_result.stdout + errors = dot_result.stderr.strip() + if config_info is None: + # Filter warnings about fonts + errors = re.sub(r"^.*couldn\'t load font .*$", "", errors, re.MULTILINE) + errors = errors.strip() + if errors or dot_result.returncode != 0: + error_func = logger.warning if dot_result.returncode == 0 else logger.error + error_func( # type: ignore[operator] + "Error running %r with env %r: %s", dot_cmd, env, errors, location=node + ) + if dot_result.returncode != 0: + raise docutils.nodes.SkipNode + + ET.register_namespace("", "http://www.w3.org/2000/svg") + root = ET.fromstring(svg_output) + xlink_href_key = "{http://www.w3.org/1999/xlink}href" + xlink_title_key = "{http://www.w3.org/1999/xlink}title" + a_tag = "{http://www.w3.org/2000/svg}a" + text_tag = "{http://www.w3.org/2000/svg}text" + + for element in root.iter(): + style = "" + attrib = element.attrib + for attr in ("fill", "stroke"): + attrib_val = attrib.get(attr) + if attrib_val is None: + continue + var_val = replacement_to_var.get(attrib_val) + if var_val is not None: + del attrib[attr] + style += f"{attr}: {var_val};" + # font_family = attrib.get("font-family") + # if font is not None and font_family == ttf_font: # using a cached google font + # attrib["font-family"] = font + # elif font is None and font_family is not None: # using a system font (via CSS) + # attrib.pop("font-family") + # style += "font-family: var(--md-text-font-family);" + href = attrib.pop(xlink_href_key, None) + if href is not None: + attrib["href"] = href + title = attrib.pop(xlink_title_key, None) + if title is not None: + title_element = ET.Element("title") + title_element.text = title + element.append(title_element) + if element.tag == a_tag: + for child in element: + if child.tag == text_tag: + child.attrib["within_a"] = "true" + within_a = attrib.pop("within_a", None) + # if within_a: + # style += "--pst-color-link-hover: var(--pst-color-link-hover);" + if style: + attrib["style"] = style + + classes = [imgcls, "graphviz"] + node.get("classes", []) + imgcls = " ".join(filter(None, classes)) + root.attrib["class"] = (root.attrib.get("class", "") + " " + imgcls).strip() + + base_scale = 0.75 + + def convert_width_or_height(s: str): + assert s.endswith("pt") + val_pt = float(s[:-2]) + val_px = val_pt / 0.75 + val_rem = val_px / 16 * base_scale + return f"{val_rem}rem" + + root_style = "" + for attr in ("width", "height"): + attrib_val = root.attrib.pop(attr, None) + if attrib_val is not None: + root_style += f"{attr}: {convert_width_or_height(attrib_val)};" + if root_style: + root.attrib["style"] = root_style + + svg_output = ET.tostring(root, encoding="unicode") + + if alt is None: + alt = node.get("alt", self.encode(code).strip()) + if "align" in node: + self.body.append( + '
' % (node["align"], node["align"]) + ) + self.body.append(svg_output) + if "align" in node: + self.body.append("
\n") + + raise docutils.nodes.SkipNode + + +def _replace_var_refs_with_defaults(code: str) -> str: + code = re.sub(r'"var\s*\(.*?,\s*(.*)\)"', lambda m: f'"{m.group(1)}"', code) + return code + + +def on_build_finished(*args, **kwargs) -> None: + # Suppress inclusion of the graphviz.css file supplied by + # `sphinx.ext.graphviz`. This theme provides its own style rules. + pass + + +sphinx.ext.graphviz.on_config_inited = on_build_finished # type: ignore[attr-defined] +sphinx.ext.graphviz.on_build_finished = on_build_finished # type: ignore[attr-defined] + + +def _preprocess_graphviz_node( + directive: sphinx.util.docutils.SphinxDirective, + node: sphinx.ext.graphviz.graphviz, + line_offset: int, +) -> None: + code = node["code"] + + xrefs: Dict[str, Tuple[str, int]] = {} + + def replace_xref(m: re.Match) -> str: + line_index = code.count("\n", 0, m.start()) + xref_text = m.group(1).replace(r"\"", '"') + xref_index = len(xrefs) + xref_id = f"__SPHINX_IMMATERIAL_XREF_{xref_index}__" + xref_id, _ = xrefs.setdefault(xref_text, (xref_id, line_index - line_offset)) + return xref_id + + code = re.sub(r'\bxref\s*=\s*"((?:[^\\"]*|(?:\\.|"))*)"', replace_xref, code) + + node["code"] = code + + filename = node.get("filename") + + for xref_text, (xref_id, line_index) in xrefs.items(): + container = docutils.nodes.container() + container["xref_id"] = xref_id + + # Determine source location + if filename is None: + source_path, source_offset = directive.content.items[line_index] + else: + source_path = os.path.join(directive.env.app.srcdir, filename) + source_offset = line_index + + nodes = parse_rst( + state=directive.state, + text=xref_text, + source_path=source_path, + source_line=source_offset, + ) + container += nodes + node += container + + +def _monkey_patch_graphviz_directive( + directive: Type[sphinx.util.docutils.SphinxDirective], line_offset: int +): + orig_run = directive.run + + def run( + self: sphinx.util.docutils.SphinxDirective, + ) -> Sequence[docutils.nodes.Node]: + nodes = orig_run(self) + for node in nodes: + for graphviz_node in node.findall(condition=sphinx.ext.graphviz.graphviz): + _preprocess_graphviz_node(self, graphviz_node, line_offset) + return nodes + + directive.run = run # type: ignore[assignment] + + +_monkey_patch_graphviz_directive(sphinx.ext.graphviz.Graphviz, line_offset=0) +_monkey_patch_graphviz_directive(sphinx.ext.graphviz.GraphvizSimple, line_offset=2) + + +def _monkey_patch_render_dot(name: str) -> None: + orig_render = getattr(sphinx.ext.graphviz, name) + + def render_dot( + self, node: sphinx.ext.graphviz.graphviz, code: str, options: Dict, **kwargs + ): + code = _replace_resolved_xrefs(node, code) + code = _replace_var_refs_with_defaults(code) + return orig_render(self, node, code, options, **kwargs) + + setattr(sphinx.ext.graphviz, name, render_dot) + + +sphinx.ext.graphviz.render_dot_html = render_dot_html +_monkey_patch_render_dot("render_dot_texinfo") +_monkey_patch_render_dot("render_dot_latex") + +def remove_css_file(app: sphinx.application.Sphinx, filename: str): + """Removes a CSS file added by another extension.""" + app_css_files = app.registry.css_files + app_indices = [i for i, x in enumerate(app_css_files) if x[0] == filename] + for i in reversed(app_indices): + del app_css_files[i] + + if hasattr(app, "builder") and hasattr(app.builder, "add_css_file"): + builder_css_files = app.builder.css_files # type: ignore[attr-defined] + builder_indices = [ + i for i, x in enumerate(builder_css_files) if x.filename == filename + ] + for i in 
reversed(builder_indices): + del builder_css_files[i] + +def setup(app: sphinx.application.Sphinx) -> Dict[str, Any]: + app.setup_extension("sphinx.ext.graphviz") + app.add_config_value( + "graphviz_ignore_incorrect_font_metrics", + types=(bool,), + default=False, + rebuild="env", + ) + remove_css_file(app, "graphviz.css") + return {"parallel_read_safe": True, "parallel_write_safe": True} \ No newline at end of file diff --git a/docs/get-started.rst b/docs/get-started.rst new file mode 100644 index 0000000..c75e6f6 --- /dev/null +++ b/docs/get-started.rst @@ -0,0 +1,38 @@ +Get Started +=========== + +Ultimate Sitemap Parser can be installed from PyPI or conda-forge: + +.. tab-set:: + + .. tab-item:: pip + + .. code-block:: shell-session + + $ pip install ultimate-sitemap-parser + + .. tab-item:: conda + + .. code-block:: shell-session + + $ conda install -c conda-forge ultimate-sitemap-parser + + +Traversing a website's sitemaps and retrieving all webpages requires just a single line of code: + +.. code-block:: python + + from usp.tree import sitemap_tree_for_homepage + + tree = sitemap_tree_for_homepage('https://example.org/') + +This will return a tree representing the structure of the sitemaps. To iterate through the pages, use :func:`tree.all_pages() `. + +.. code-block:: python + + for page in tree.all_pages(): + print(page.url) + +This will output the URL of each page in the sitemap, loading the parsed representations of sitemaps `lazily to reduce memory usage `_ in very large sitemaps. + +Each page is an instance of :class:`~usp.objects.page.SitemapPage`, which will always have at least a URL and priority, and may have other attributes if present. diff --git a/docs/guides/_sitemap_examples/atom0.3.xml b/docs/guides/_sitemap_examples/atom0.3.xml new file mode 100644 index 0000000..967cf41 --- /dev/null +++ b/docs/guides/_sitemap_examples/atom0.3.xml @@ -0,0 +1,18 @@ + + + Example + + 2024-01-01 + + Page 1 + + https://example.org/page1 + 2024-01-01 + + + Page 2 + + https://example.org/page2 + 2024-01-02 + + \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/atom1.0.xml b/docs/guides/_sitemap_examples/atom1.0.xml new file mode 100644 index 0000000..4f35803 --- /dev/null +++ b/docs/guides/_sitemap_examples/atom1.0.xml @@ -0,0 +1,18 @@ + + + Example + + 2024-01-01 + + Page 1 + + https://example.org/page1 + 2024-01-01 + + + Page 2 + + https://example.org/page2 + 2024-01-02 + + \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/bbc-sitemap.dot b/docs/guides/_sitemap_examples/bbc-sitemap.dot new file mode 100644 index 0000000..ce2c32b --- /dev/null +++ b/docs/guides/_sitemap_examples/bbc-sitemap.dot @@ -0,0 +1,365 @@ +digraph G { +rankdir=LR; +s137614679169408 [label="/", shape=box]; +"s137614679169408" -> "s137614698158272"; +s137614698158272 [label="/robots.txt", shape=box]; +"s137614698158272" -> "s137614647809024"; +s137614647809024 [label="/sitemap.xml", shape=box]; +"s137614647809024" -> "s137614711841152"; +s137614711841152 [label="/sport/sitemap.xml", shape=oval]; +s137614711841152 -> p137614711841152; + +"s137614647809024" -> "s137614631855936"; +s137614631855936 [label="/news/localnews/locations/sitemap.xml", shape=oval]; +s137614631855936 -> p137614631855936; + +"s137614647809024" -> "s137614647672448"; +s137614647672448 [label="/news/politics/eu-regions/vote2014_sitemap.xml", shape=oval]; +s137614647672448 -> p137614647672448; + +"s137614647809024" -> "s137614643457152"; +s137614643457152 
[label="/news/politics/councils/vote2014_sitemap.xml", shape=oval]; +s137614643457152 -> p137614643457152; + +"s137614647809024" -> "s137614647670784"; +s137614647670784 [label="/news/events/vote2014/sitemap.xml", shape=oval]; +s137614647670784 -> p137614647670784; + +"s137614647809024" -> "s137614611236736"; +s137614611236736 [label="/learningenglish/sitemap.xml", shape=oval]; +s137614611236736 -> p137614611236736; + +"s137614698158272" -> "s137614611578944"; +s137614611578944 [label="/sitemaps/https-index-uk-archive.xml", shape=box]; +"s137614611578944" -> "s137614647819584"; +s137614647819584 [label="/sitemaps/https-sitemap-uk-archive-1.xml", shape=oval]; +s137614647819584 -> p137614647819584; + +"s137614611578944" -> "s137614611125504"; +s137614611125504 [label="/sitemaps/https-sitemap-uk-archive-2.xml", shape=oval]; +s137614611125504 -> p137614611125504; + +"s137614611578944" -> "s137614611120960"; +s137614611120960 [label="/sitemaps/https-sitemap-uk-archive-3.xml", shape=oval]; +s137614611120960 -> p137614611120960; + +"s137614611578944" -> "s137614611132928"; +s137614611132928 [label="/sitemaps/https-sitemap-uk-archive-4.xml", shape=oval]; +s137614611132928 -> p137614611132928; + +"s137614611578944" -> "s137614611465472"; +s137614611465472 [label="/sitemaps/https-sitemap-uk-archive-5.xml", shape=oval]; +s137614611465472 -> p137614611465472; + +"s137614611578944" -> "s137614611467776"; +s137614611467776 [label="/sitemaps/https-sitemap-uk-archive-6.xml", shape=oval]; +s137614611467776 -> p137614611467776; + +"s137614611578944" -> "s137614595150208"; +s137614595150208 [label="/sitemaps/https-sitemap-uk-archive-7.xml", shape=oval]; +s137614595150208 -> p137614595150208; + +"s137614611578944" -> "s137614648002496"; +s137614648002496 [label="/sitemaps/https-sitemap-uk-archive-8.xml", shape=oval]; +s137614648002496 -> p137614648002496; + +"s137614611578944" -> "s137614647836992"; +s137614647836992 [label="/sitemaps/https-sitemap-uk-archive-9.xml", shape=oval]; +s137614647836992 -> p137614647836992; + +"s137614611578944" -> "s137614611134272"; +s137614611134272 [label="/sitemaps/https-sitemap-uk-archive-10.xml", shape=oval]; +s137614611134272 -> p137614611134272; + +"s137614611578944" -> "s137614611122880"; +s137614611122880 [label="/sitemaps/https-sitemap-uk-archive-11.xml", shape=oval]; +s137614611122880 -> p137614611122880; + +"s137614611578944" -> "s137614611239680"; +s137614611239680 [label="/sitemaps/https-sitemap-uk-archive-12.xml", shape=oval]; +s137614611239680 -> p137614611239680; + +"s137614611578944" -> "s137614611075328"; +s137614611075328 [label="/sitemaps/https-sitemap-uk-archive-13.xml", shape=oval]; +s137614611075328 -> p137614611075328; + +"s137614611578944" -> "s137614611125696"; +s137614611125696 [label="/sitemaps/https-sitemap-uk-archive-14.xml", shape=oval]; +s137614611125696 -> p137614611125696; + +"s137614611578944" -> "s137614611305856"; +s137614611305856 [label="/sitemaps/https-sitemap-uk-archive-15.xml", shape=oval]; +s137614611305856 -> p137614611305856; + +"s137614611578944" -> "s137614595736768"; +s137614595736768 [label="/sitemaps/https-sitemap-uk-archive-16.xml", shape=oval]; +s137614595736768 -> p137614595736768; + +"s137614611578944" -> "s137614611470016"; +s137614611470016 [label="/sitemaps/https-sitemap-uk-archive-17.xml", shape=oval]; +s137614611470016 -> p137614611470016; + +"s137614611578944" -> "s137614612660416"; +s137614612660416 [label="/sitemaps/https-sitemap-uk-archive-18.xml", shape=oval]; +s137614612660416 -> p137614612660416; + 
+"s137614611578944" -> "s137614611076736"; +s137614611076736 [label="/sitemaps/https-sitemap-uk-archive-19.xml", shape=oval]; +s137614611076736 -> p137614611076736; + +"s137614611578944" -> "s137614611080192"; +s137614611080192 [label="/sitemaps/https-sitemap-uk-archive-20.xml", shape=oval]; +s137614611080192 -> p137614611080192; + +"s137614611578944" -> "s137614611078784"; +s137614611078784 [label="/sitemaps/https-sitemap-uk-archive-21.xml", shape=oval]; +s137614611078784 -> p137614611078784; + +"s137614611578944" -> "s137614611080704"; +s137614611080704 [label="/sitemaps/https-sitemap-uk-archive-22.xml", shape=oval]; +s137614611080704 -> p137614611080704; + +"s137614611578944" -> "s137614611081728"; +s137614611081728 [label="/sitemaps/https-sitemap-uk-archive-23.xml", shape=oval]; +s137614611081728 -> p137614611081728; + +"s137614611578944" -> "s137614612666752"; +s137614612666752 [label="/sitemaps/https-sitemap-uk-archive-24.xml", shape=oval]; +s137614612666752 -> p137614612666752; + +"s137614611578944" -> "s137614611273088"; +s137614611273088 [label="/sitemaps/https-sitemap-uk-archive-25.xml", shape=oval]; +s137614611273088 -> p137614611273088; + +"s137614611578944" -> "s137614611080960"; +s137614611080960 [label="/sitemaps/https-sitemap-uk-archive-26.xml", shape=oval]; +s137614611080960 -> p137614611080960; + +"s137614611578944" -> "s137614611087168"; +s137614611087168 [label="/sitemaps/https-sitemap-uk-archive-27.xml", shape=oval]; +s137614611087168 -> p137614611087168; + +"s137614611578944" -> "s137614611125824"; +s137614611125824 [label="/sitemaps/https-sitemap-uk-archive-28.xml", shape=oval]; +s137614611125824 -> p137614611125824; + +"s137614611578944" -> "s137614604391616"; +s137614604391616 [label="/sitemaps/https-sitemap-uk-archive-29.xml", shape=oval]; +s137614604391616 -> p137614604391616; + +"s137614611578944" -> "s137614602720512"; +s137614602720512 [label="/sitemaps/https-sitemap-uk-archive-30.xml", shape=oval]; +s137614602720512 -> p137614602720512; + +"s137614611578944" -> "s137614611071808"; +s137614611071808 [label="/sitemaps/https-sitemap-uk-archive-31.xml", shape=oval]; +s137614611071808 -> p137614611071808; + +"s137614611578944" -> "s137614602720832"; +s137614602720832 [label="/sitemaps/https-sitemap-uk-archive-32.xml", shape=oval]; +s137614602720832 -> p137614602720832; + +"s137614611578944" -> "s137614611083520"; +s137614611083520 [label="/sitemaps/https-sitemap-uk-archive-33.xml", shape=oval]; +s137614611083520 -> p137614611083520; + +"s137614611578944" -> "s137614611267968"; +s137614611267968 [label="/sitemaps/https-sitemap-uk-archive-34.xml", shape=oval]; +s137614611267968 -> p137614611267968; + +"s137614611578944" -> "s137614611084352"; +s137614611084352 [label="/sitemaps/https-sitemap-uk-archive-35.xml", shape=oval]; +s137614611084352 -> p137614611084352; + +"s137614611578944" -> "s137614611274304"; +s137614611274304 [label="/sitemaps/https-sitemap-uk-archive-36.xml", shape=oval]; +s137614611274304 -> p137614611274304; + +"s137614611578944" -> "s137614602722112"; +s137614602722112 [label="/sitemaps/https-sitemap-uk-archive-37.xml", shape=oval]; +s137614602722112 -> p137614602722112; + +"s137614611578944" -> "s137614602726784"; +s137614602726784 [label="/sitemaps/https-sitemap-uk-archive-38.xml", shape=oval]; +s137614602726784 -> p137614602726784; + +"s137614611578944" -> "s137614611128704"; +s137614611128704 [label="/sitemaps/https-sitemap-uk-archive-39.xml", shape=oval]; +s137614611128704 -> p137614611128704; + +"s137614611578944" -> "s137614611131200"; 
+s137614611131200 [label="/sitemaps/https-sitemap-uk-archive-40.xml", shape=oval]; +s137614611131200 -> p137614611131200; + +"s137614611578944" -> "s137614604388160"; +s137614604388160 [label="/sitemaps/https-sitemap-uk-archive-41.xml", shape=oval]; +s137614604388160 -> p137614604388160; + +"s137614611578944" -> "s137614611086400"; +s137614611086400 [label="/sitemaps/https-sitemap-uk-archive-42.xml", shape=oval]; +s137614611086400 -> p137614611086400; + +"s137614611578944" -> "s137614611082752"; +s137614611082752 [label="/sitemaps/https-sitemap-uk-archive-43.xml", shape=oval]; +s137614611082752 -> p137614611082752; + +"s137614611578944" -> "s137614611077056"; +s137614611077056 [label="/sitemaps/https-sitemap-uk-archive-44.xml", shape=oval]; +s137614611077056 -> p137614611077056; + +"s137614611578944" -> "s137614611079680"; +s137614611079680 [label="/sitemaps/https-sitemap-uk-archive-45.xml", shape=oval]; +s137614611079680 -> p137614611079680; + +"s137614611578944" -> "s137614611076864"; +s137614611076864 [label="/sitemaps/https-sitemap-uk-archive-46.xml", shape=oval]; +s137614611076864 -> p137614611076864; + +"s137614611578944" -> "s137614602721792"; +s137614602721792 [label="/sitemaps/https-sitemap-uk-archive-47.xml", shape=oval]; +s137614602721792 -> p137614602721792; + +"s137614611578944" -> "s137614611084224"; +s137614611084224 [label="/sitemaps/https-sitemap-uk-archive-48.xml", shape=oval]; +s137614611084224 -> p137614611084224; + +"s137614611578944" -> "s137614611304896"; +s137614611304896 [label="/sitemaps/https-sitemap-uk-archive-49.xml", shape=oval]; +s137614611304896 -> p137614611304896; + +"s137614611578944" -> "s137614611475072"; +s137614611475072 [label="/sitemaps/https-sitemap-uk-archive-50.xml", shape=oval]; +s137614611475072 -> p137614611475072; + +"s137614698158272" -> "s137614675112448"; +s137614675112448 [label="/sitemaps/https-index-uk-news.xml", shape=box]; +"s137614675112448" -> "s137614630885120"; +s137614630885120 [label="/sitemaps/https-sitemap-uk-news-1.xml", shape=oval]; +s137614630885120 -> p137614630885120; + +"s137614675112448" -> "s137614612674752"; +s137614612674752 [label="/sitemaps/https-sitemap-uk-news-2.xml", shape=oval]; +s137614612674752 -> p137614612674752; + +"s137614698158272" -> "s137614631504256"; +s137614631504256 [label="/food/sitemap.xml", shape=oval]; +s137614631504256 -> p137614631504256; + +"s137614698158272" -> "s137614670741248"; +s137614670741248 [label="/bitesize/sitemap/sitemapindex.xml", shape=box]; +"s137614670741248" -> "s137614611268096"; +s137614611268096 [label="/bitesize/sitemap/sitemapindex_part1.xml", shape=oval]; +s137614611268096 -> p137614611268096; + +"s137614670741248" -> "s137614587353536"; +s137614587353536 [label="/bitesize/sitemap/sitemapindex_part2.xml", shape=oval]; +s137614587353536 -> p137614587353536; + +"s137614698158272" -> "s137614627379072"; +s137614627379072 [label="/teach/sitemap/sitemapindex.xml", shape=oval]; +s137614627379072 -> p137614627379072; + +"s137614698158272" -> "s137614701232576"; +s137614701232576 [label="/sitemaps/https-index-uk-archive_video.xml", shape=box]; +"s137614701232576" -> "s137614599090816"; +s137614599090816 [label="/sitemaps/https-sitemap-uk-archive_video-1.xml", shape=oval]; +s137614599090816 -> p137614599090816; + +"s137614698158272" -> "s137614604678784"; +s137614604678784 [label="/sitemaps/https-index-uk-video.xml", shape=box]; +"s137614604678784" -> "s137614604670656"; +s137614604670656 [label="/sitemaps/https-sitemap-uk-video-1.xml", shape=oval]; +s137614604670656 -> 
p137614604670656; + +"s137614698158272" -> "s137614666594880"; +s137614666594880 [label="/sitemaps/sitemap-uk-ws-topics.xml", shape=oval]; +s137614666594880 -> p137614666594880; + +"s137614698158272" -> "s137614583722752"; +s137614583722752 [label="/sport/sitemap.xml", shape=oval]; +s137614583722752 -> p137614583722752; + +"s137614698158272" -> "s137614583710144"; +s137614583710144 [label="/sitemaps/sitemap-uk-topics.xml", shape=oval]; +s137614583710144 -> p137614583710144; + +"s137614698158272" -> "s137614652409536"; +s137614652409536 [label="/ideas/sitemap.xml", shape=oval]; +s137614652409536 -> p137614652409536; + +"s137614698158272" -> "s137614583211392"; +s137614583211392 [label="/tiny-happy-people/sitemap/sitemapindex.xml", shape=oval]; +s137614583211392 -> p137614583211392; + +{rank=same; +p137614711841152 [label="43 pages", shape=plain]; +p137614631855936 [label="17752 pages", shape=plain]; +p137614647672448 [label="12 pages", shape=plain]; +p137614643457152 [label="204 pages", shape=plain]; +p137614647670784 [label="0 pages", shape=plain]; +p137614611236736 [label="11134 pages", shape=plain]; +p137614647819584 [label="50000 pages", shape=plain]; +p137614611125504 [label="50000 pages", shape=plain]; +p137614611120960 [label="50000 pages", shape=plain]; +p137614611132928 [label="50000 pages", shape=plain]; +p137614611465472 [label="50000 pages", shape=plain]; +p137614611467776 [label="50000 pages", shape=plain]; +p137614595150208 [label="50000 pages", shape=plain]; +p137614648002496 [label="50000 pages", shape=plain]; +p137614647836992 [label="50000 pages", shape=plain]; +p137614611134272 [label="50000 pages", shape=plain]; +p137614611122880 [label="50000 pages", shape=plain]; +p137614611239680 [label="50000 pages", shape=plain]; +p137614611075328 [label="50000 pages", shape=plain]; +p137614611125696 [label="50000 pages", shape=plain]; +p137614611305856 [label="50000 pages", shape=plain]; +p137614595736768 [label="50000 pages", shape=plain]; +p137614611470016 [label="50000 pages", shape=plain]; +p137614612660416 [label="50000 pages", shape=plain]; +p137614611076736 [label="50000 pages", shape=plain]; +p137614611080192 [label="50000 pages", shape=plain]; +p137614611078784 [label="50000 pages", shape=plain]; +p137614611080704 [label="50000 pages", shape=plain]; +p137614611081728 [label="50000 pages", shape=plain]; +p137614612666752 [label="50000 pages", shape=plain]; +p137614611273088 [label="50000 pages", shape=plain]; +p137614611080960 [label="50000 pages", shape=plain]; +p137614611087168 [label="50000 pages", shape=plain]; +p137614611125824 [label="50000 pages", shape=plain]; +p137614604391616 [label="50000 pages", shape=plain]; +p137614602720512 [label="50000 pages", shape=plain]; +p137614611071808 [label="50000 pages", shape=plain]; +p137614602720832 [label="50000 pages", shape=plain]; +p137614611083520 [label="50000 pages", shape=plain]; +p137614611267968 [label="50000 pages", shape=plain]; +p137614611084352 [label="50000 pages", shape=plain]; +p137614611274304 [label="50000 pages", shape=plain]; +p137614602722112 [label="50000 pages", shape=plain]; +p137614602726784 [label="50000 pages", shape=plain]; +p137614611128704 [label="50000 pages", shape=plain]; +p137614611131200 [label="50000 pages", shape=plain]; +p137614604388160 [label="50000 pages", shape=plain]; +p137614611086400 [label="50000 pages", shape=plain]; +p137614611082752 [label="50000 pages", shape=plain]; +p137614611077056 [label="50000 pages", shape=plain]; +p137614611079680 [label="50000 pages", shape=plain]; 
+p137614611076864 [label="50000 pages", shape=plain]; +p137614602721792 [label="50000 pages", shape=plain]; +p137614611084224 [label="50000 pages", shape=plain]; +p137614611304896 [label="50000 pages", shape=plain]; +p137614611475072 [label="20973 pages", shape=plain]; +p137614630885120 [label="881 pages", shape=plain]; +p137614612674752 [label="213 pages", shape=plain]; +p137614631504256 [label="21782 pages", shape=plain]; +p137614611268096 [label="50000 pages", shape=plain]; +p137614587353536 [label="9825 pages", shape=plain]; +p137614627379072 [label="6597 pages", shape=plain]; +p137614599090816 [label="22448 pages", shape=plain]; +p137614604670656 [label="63 pages", shape=plain]; +p137614666594880 [label="20259 pages", shape=plain]; +p137614583722752 [label="43 pages", shape=plain]; +p137614583710144 [label="1094 pages", shape=plain]; +p137614652409536 [label="867 pages", shape=plain]; +p137614583211392 [label="1181 pages", shape=plain]; +} +} diff --git a/docs/guides/_sitemap_examples/class-tree.dot b/docs/guides/_sitemap_examples/class-tree.dot new file mode 100644 index 0000000..2fd9c48 --- /dev/null +++ b/docs/guides/_sitemap_examples/class-tree.dot @@ -0,0 +1,4 @@ +digraph G { + root [label="IndexWebsiteSitemap"] + robots [label="IndexRobots +} \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/google-news.xml b/docs/guides/_sitemap_examples/google-news.xml new file mode 100644 index 0000000..baad2df --- /dev/null +++ b/docs/guides/_sitemap_examples/google-news.xml @@ -0,0 +1,29 @@ + + + + + https://example.org/news/one + + + + Example.org News + + 2024-01-01 + News Article One + + + + + https://example.org/news/two + + + + Example.org News + + 2024-01-02 + News Article Two + + + \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/plaintext.txt b/docs/guides/_sitemap_examples/plaintext.txt new file mode 100644 index 0000000..3e2c98c --- /dev/null +++ b/docs/guides/_sitemap_examples/plaintext.txt @@ -0,0 +1,4 @@ +Yes, there are plain text sitemaps and they could just contain random text. 
+ +https://example.org/page1 +https://example.org/page2 \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/rss2.0.xml b/docs/guides/_sitemap_examples/rss2.0.xml new file mode 100644 index 0000000..21d702a --- /dev/null +++ b/docs/guides/_sitemap_examples/rss2.0.xml @@ -0,0 +1,18 @@ + + + + Example + https://example.org/ + Example + + Page 1 + https://example.org/page1 + 2024-01-01 + + + Page 2 + https://example.org/page2 + 2024-01-02 + + + \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/simple-index.xml b/docs/guides/_sitemap_examples/simple-index.xml new file mode 100644 index 0000000..bb7e101 --- /dev/null +++ b/docs/guides/_sitemap_examples/simple-index.xml @@ -0,0 +1,11 @@ + + + + https://example.org/sitemap1.xml + 2024-01-01 + + + https://example.org/sitemap2.xml + 2024-01-02 + + \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/simple-urlset.xml b/docs/guides/_sitemap_examples/simple-urlset.xml new file mode 100644 index 0000000..dd96ad5 --- /dev/null +++ b/docs/guides/_sitemap_examples/simple-urlset.xml @@ -0,0 +1,11 @@ + + + + https://example.org/page1 + 2024-01-01 + + + https://example.org/page2 + 2024-01-02 + + \ No newline at end of file diff --git a/docs/guides/fetch-parse.rst b/docs/guides/fetch-parse.rst new file mode 100644 index 0000000..5695a0a --- /dev/null +++ b/docs/guides/fetch-parse.rst @@ -0,0 +1,56 @@ +Fetch and Parse Process +======================= + +When calling :func:`~usp.tree.sitemap_tree_for_homepage`, USP will try several methods to find sitemaps and recurse through sub-sitemaps. + +Broadly the process is as follows: + +1. Attempt to fetch ``https://example.org/robots.txt`` and parse for ``Sitemap:`` statements. We consider ``robots.txt`` to be an index-type sitemap (as it lists other sitemaps) +2. Fetch and parse each discovered sitemap URL. If a sitemap is an index-type sitemap, recurse into it. +3. Try to fetch known sitemap locations like ``/sitemap.xml`` and ``/sitemap_index.xml``, excluding those already declared in ``robots.txt``. +4. Create a top-level dummy sitemap to act as the parent of ``robots.txt`` and discovered sitemaps. + +.. seealso:: + :class: sidebar + + :doc:`Reference of formats, parsing and representation classes ` + +All fetching is done through the :class:`~usp.fetch_parse.SitemapFetcher` class, which is responsible for fetching and choosing the appropriate parser for the content. + +The fetcher then attempts to parse using the process shown in this flowchart: + +.. dropdown:: Show Parse Flowchart + :icon: workflow + :name: fig-parse-flow + + .. graphviz:: parse_flow.dot + +Non-XML documents are parsed directly with their respective parser. For XML documents, the :class:`~usp.fetch_parse.XMLSitemapParser` parses the document to determine the type of the XML document and select the appropriate parser (the *concrete parser*) to actually extract information. + +XML documents are detected with a heuristic (the document, when leading whitespace is trimmed, starts with ``<``) to avoid issues with incorrect content types. + +Index-type parsers instantiate the appropriate class from :mod:`usp.objects.sitemap` and another :class:`~usp.fetch_parse.SitemapFetcher` to fetch each of their children. This allows a sitemap of one type (e.g. robots.txt) to contain sitemaps of another type (e.g. XML). Duplicate declarations of sub-sitemaps within the same index-type sitemap are ignored, but otherwise order is preserved. 
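+
+Once a fetch completes, this recursion is reflected in the classes of the returned objects: index-type
+sitemaps carry their children in ``sub_sitemaps``, while page-type sitemaps carry ``pages``. The following
+is a minimal sketch using only the documented public API (the URL is a placeholder):
+
+.. code-block:: python
+
+    from usp.objects.sitemap import (
+        AbstractIndexSitemap,
+        AbstractPagesSitemap,
+        InvalidSitemap,
+    )
+    from usp.tree import sitemap_tree_for_homepage
+
+    tree = sitemap_tree_for_homepage('https://example.org/')
+
+    for sitemap in tree.all_sitemaps():
+        if isinstance(sitemap, InvalidSitemap):
+            print('invalid:', sitemap.url)
+        elif isinstance(sitemap, AbstractIndexSitemap):
+            # e.g. robots.txt or an XML sitemap index
+            print('index:', sitemap.url, len(sitemap.sub_sitemaps), 'sub-sitemaps')
+        elif isinstance(sitemap, AbstractPagesSitemap):
+            # e.g. an XML urlset, plain text, RSS or Atom sitemap
+            print('pages:', sitemap.url, len(sitemap.pages), 'pages')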
+ +Page-type parsers instantiate the appropriate class from :mod:`usp.objects.sitemap` and instantiate instances of their internal page class (e.g. :class:`PagesXMLSitemapParser.Page `). These are not converted to the public class :class:`~usp.objects.page.SitemapPage` until the end of the fetch process. The order sub-sitemaps or pages are declare in is preserved. + + +.. _process_tree_construction: + +Tree Construction +----------------- + +.. seealso:: + + :doc:`/guides/sitemap-tree` + +Each parser instance returns an object inheriting from :class:`~usp.objects.sitemap.AbstractSitemap` after the parse process (including any child fetch-and-parses), constructing the tree from the bottom up. The top :class:`~usp.objects.sitemap.IndexWebsiteSitemap` is then created to act as the parent of ``robots.txt`` and all well-known-path discovered sitemaps. + +.. _process_dedup: + +Deduplication +------------- + +During the parse process, some de-duplication is performed within each individual sitemap. In an index sitemap, only the first declaration of a sub-sitemap is fetched. In a page sitemap, only the first declaration of a page is included. + +However, this means that if a sub-sitemap is declared in multiple index sitemaps, or a page is declared in multiple page sitemaps, it will be included multiple times. + diff --git a/docs/guides/parse_flow.dot b/docs/guides/parse_flow.dot new file mode 100644 index 0000000..5f24db8 --- /dev/null +++ b/docs/guides/parse_flow.dot @@ -0,0 +1,142 @@ +strict digraph { + + fetch [ + shape = "rect" + group=main + xref=":class:`~usp.fetch_parse.SitemapFetcher`" + label= "Fetch with\nSitemapFetcher" + ] + + httpsucc [ + label="Did request\nsucceed?" + shape=diamond; + group=main + + ] + + fetch -> httpsucc + + returninvalid [ + xref=":class:`~usp.objects.sitemap.InvalidSitemap`" + label = "Return\nInvalidSitemap" + ] + + httpsucc -> returninvalid [label="No"] + + { + rank=same; + httpsucc; returninvalid; userobots; + } + + seemsxml [ + label="Starts with\n '<' char?" + shape=diamond + group=main + ] + httpsucc -> seemsxml [label="Yes"] + + isrobots [ + label="URL ends with\nrobots.txt?" + shape=diamond + group=robots + ] + seemsxml -> isrobots [label="No"] + + {rank=same; + seemsxml;isrobots; + } + + useplaintext [ + label="PlainTextSitemapParser" + group=robots + xref=":class:`~usp.fetch_parse.PlainTextSitemapParser`" + ] + + isrobots -> useplaintext [label="No"] + + userobots [ + xref=":class:`~usp.fetch_parse.IndexRobotsTxtSitemapParser`" + label="IndexRobotsTxtSitemapParser" + group=robots + ] + isrobots -> userobots [label="Yes"] + + + xmlparse [ + xref=":class:`~usp.fetch_parse.XMLSitemapParser`" + label="XMLSitemapParser", + shape="parallelogram" + group=main + ] + seemsxml -> xmlparse [label="Yes"] + + isurlset [ + label="Has\n?" + shape="diamond" + ] + xmlparse -> isurlset + + concretepages [ + xref=":class:`~usp.fetch_parse.PagesXMLSitemapParser`" + label="PagesXMLSitemapParser" + ] + isurlset -> concretepages [label="Yes"] + + {rank=same; + isurlset;concretepages; + } + + issitemapindex [ + label="Has\n?" + shape=diamond + ] + isurlset -> issitemapindex [label="No"] + + concreteindex [ + xref=":class:`~usp.fetch_parse.IndexXMLSitemapParser`" + label="IndexXMLSitemapParser" + ] + issitemapindex -> concreteindex [label="Yes"] + + {rank=same; + issitemapindex;concreteindex + } + + isrss [ + label="Has\n?" 
+ shape=diamond + ] + issitemapindex -> isrss [label="No"] + + concreterss [ + xref=":class:`~usp.fetch_parse.PagesRSSSitemapParser`" + label="PagesRSSSitemapParser" + ] + isrss -> concreterss [label="Yes"] + + {rank=same; + isrss; concreterss; + } + + isatom [ + label="Has\n?" + shape=diamond + ] + isrss -> isatom [label="No"] + + concreteatom [ + xref=":class:`~usp.fetch_parse.PagesAtomSitemapParser`" + label="PagesAtomSitemapParser" + ] + isatom -> concreteatom [label="Yes"] + + {rank=same; + isatom; concreteatom; + } + + error [ + xref=":class:`~usp.exceptions.SitemapXMLParsingException`" + label="Raise\nSitemapXMLParsingException" + ] + isatom -> error [label="No"] +} \ No newline at end of file diff --git a/docs/guides/performance.rst b/docs/guides/performance.rst new file mode 100644 index 0000000..7de9b16 --- /dev/null +++ b/docs/guides/performance.rst @@ -0,0 +1,55 @@ +Performance +=========== + +USP is able to parse even very large and complex sitemaps very quickly and in a memory-efficient fashion. + +As an example, USP is able to parse the sitemap of the BBC website, which contains 2.6m URLS across 75 sitemaps, in less than a minute (excluding HTTP test times), and using approximately 90MiB of memory at peak. + + +XML Parse Efficiency +-------------------- + +For XML documents, USP uses the :external+python:doc:`Expat parser ` for high performance parsing of documents without requiring them to be strictly correct. As it is a stream-based parser, USP is able to hook its sitemap parsing into the XML parse process, opposed to having to parse the entire document and then work on the parse tree. + +Memory Efficiency +----------------- + +When constructing the :ref:`sitemap-page tree ` only the sitemap part of the tree is constantly stored in memory. During instantiation of the :class:`~usp.objects.sitemap.AbstractPagesSitemap`, page data is swapped into a temporary file on disk, and only loaded into memory when its pages are accessed. + +.. _performance_page_generator: + +Page Generator +^^^^^^^^^^^^^^ + +Due to the swapping process, it is most efficient to use the iterator returned by :func:`~usp.objects.sitemap.AbstractSitemap.all_pages` directly. This will load one sitemap's pages into memory at once, rather than all simultaneously. + +.. grid:: 1 1 2 2 + :padding: 0 + + .. grid-item-card:: + :class-item: code-card + :class-header: sd-bg-success sd-bg-text-success sd-outline-success + :class-card: sd-outline-success + + :octicon:`check` Good Practice + ^^^^ + + .. code-block:: + + for page in tree.all_pages(): + print(page.url) + + .. grid-item-card:: + :class-item: code-card + :class-header: sd-bg-warning sd-bg-text-warning sd-outline-warning + :class-card: sd-outline-warning + + :octicon:`alert` Avoid + ^^^ + + .. code-block:: + + for page in list(tree.all_pages()): + print(page.url) + +Of course, in some cases, this is unavoidable. Even so, USP is still relatively memory efficient - for the BBC website the entire page list consumes approximately 560MiB of memory (compared to the plaintext files which are approximately 370MiB). \ No newline at end of file diff --git a/docs/guides/saving.rst b/docs/guides/saving.rst new file mode 100644 index 0000000..5588402 --- /dev/null +++ b/docs/guides/saving.rst @@ -0,0 +1,75 @@ +Saving +====== + +USP supports two methods of serialisation: conversion to a dictionary or Pickle format. 
+ +As Dictionary +------------- + +Trees and pages can be converted into a dictionary with the :meth:`AbstractSitemap.to_dict() `/:meth:`SitemapPage.to_dict() ` methods. + +For example: + +.. code-block:: py + + tree = sitemap_tree_for_homepage('https://www.example.org/') + + # Complete tree representation with pages + tree.to_dict() + + # Tree representation without pages + tree.to_dict(include_pages=False) + + # Pages only + [page.to_dict() for page in tree.all_pages()] + +This could then be used with another library for data manipulation, such as Pandas: + +.. code-block:: py + + data = [page.to_dict() for page in tree.all_pages()] + # pd.json_normalize() flattens the nested key for news stories + # to dot-separated keys + pages_df = pd.DataFrame(pd.json_normalize(data)) + + pages_df.to_csv('sitemap-pages.csv', index=False) + + +As Pickle +--------- + +If you need to save the tree object itself in a format which can be loaded back later, use the :external+python:doc:`Pickle format`. The :class:`~usp.objects.sitemap.AbstractPagesSitemap` class implements custom pickling behaviour to load pages from disk, and unpickling behaviour to save them back. + +.. danger:: + + Loading Pickle data from untrusted sources can be unsafe. See the :external+python:doc:`pickle module documentation ` + +.. danger:: + + Pickling and unpickling relies on the internal private API of USP **which may change in future versions, even if the public API remains the same**. Attempting to load a pickled tree from a different version of USP may result in errors or incorrect data. + +.. warning:: + + All pages will need to be loaded into memory to pickle/unpickle the tree, so be cautious with very large sitemaps. + + +.. code-block:: py + + import pickle + from usp.tree import sitemap_tree_for_homepage + + tree = sitemap_tree_for_homepage('https://www.example.org/') + + with open('sitemap.pickle', 'wb') as f: + pickle.dump(tree, f) + + # This will delete the temporary files used to store the pages of the tree + del tree + + # Later, to load the tree back + + with open('sitemap.pickle', 'rb') as f: + tree = pickle.load(f) + + for page in tree.all_pages(): + print(page.url) diff --git a/docs/guides/security.rst b/docs/guides/security.rst new file mode 100644 index 0000000..2d6a186 --- /dev/null +++ b/docs/guides/security.rst @@ -0,0 +1,20 @@ +Security +======== + +There is inherently some risk in downloading untrusted content from the web and parsing it. + +The Expat XML parser documentation contains the following security warning: + +.. warning:: + + The pyexpat module is not secure against maliciously constructed data. If you need to parse untrusted or unauthenticated data see :external+python:ref:`xml-vulnerabilities`. + +USP minimally parses documents, so should avoid many of the risks seen in more complex parsers. Nevertheless, it is advisable to check the version of ``pyexpat`` against the notes listed in the mentioned section. + +.. code-block:: python-console + + >>> import pyexpat + >>> pyexpat.EXPAT_VERSION + 'expat_2.4.7' + +It is recommended to use a version greater than 2.4.0, which should be included in all recent Python versions. 
\ No newline at end of file diff --git a/docs/guides/sitemap-tree.rst b/docs/guides/sitemap-tree.rst new file mode 100644 index 0000000..9566cbc --- /dev/null +++ b/docs/guides/sitemap-tree.rst @@ -0,0 +1,170 @@ +Sitemap Tree +============ + +Calling :func:`~usp.tree.sitemap_tree_for_homepage` will return the root node of a tree representing the structure of the sitemaps found on a website. + +Index vs Page Sitemaps +---------------------- + +A small site may just have a single sitemap hosted at ``/sitemap.xml``, but larger sites often use a more complex structure. By convention, sitemaps are limited to 50,000 URLs or 50MB each, so large sites will have to split sitemaps. It's also common to split sitemaps semantically, such as by language or content type. + +Sitemaps are divided into two types: + +- **Index sitemaps** list other sitemaps, which may themselves be index sitemaps or page sitemaps +- **Page sitemaps** list pages + +On a more complex site, in order to find all pages, you would have to fetch the index sitemaps (potentially several levels deep) and then fetch the page sitemaps they reference. + +Basic Examples +-------------- + +A small site with a single sitemap located at ``/sitemap.xml`` would look like this: + +.. note:: + + In diagrams like these, square boxes represent index sitemaps and rounded boxes represent page sitemaps. In reality, each page-type sitemap will have a list of pages as its children, but these are omitted for brevity. + + Nodes are clickable to access the documentation for that class. + +.. graphviz:: + :name: fig-simple-sitemap + :align: center + + digraph G { + root [ + shape=record, + xref=":class:`~usp.objects.sitemap.IndexWebsiteSitemap`", + label="{IndexWebsiteSitemap|/}" + ] + sitemap [ + shape=record, + xref=":class:`~usp.objects.sitemap.PagesXMLSitemap`", + label="{PagesXMLSitemap|/sitemap.xml}", + style=rounded + ] + sitemap_p [label="Pages", shape=plain] + root -> sitemap -> sitemap_p + } + +In this case, the sitemap was discovered because it was at a well-known URL. USP has a built-in list (:data:`usp.tree._UNPUBLISHED_SITEMAP_PATHS`) of common sitemap locations to check. + +Additionally, USP checks the site's ``robots.txt`` file for a sitemap directive. Had the sitemap been declared in ``robots.txt`` instead, the tree would look like this: + +.. graphviz:: + :name: fig-simple-robots + :align: center + + digraph G { + root [ + shape=record, + xref=":class:`~usp.objects.sitemap.IndexWebsiteSitemap`", + label="{IndexWebsiteSitemap|/}" + ] + robots [ + shape=record, + xref=":class:`~usp.objects.sitemap.IndexRobotsTxtSitemap`", + label="{IndexRobotsTxtSitemap|/robots.txt}" + ] + sitemap [ + shape=record, + xref=":class:`~usp.objects.sitemap.PagesXMLSitemap`", + label="{PagesXMLSitemap|/sitemap.xml}", + style=rounded + ] + sitemap_p [label="Pages", shape=plain] + root -> robots -> sitemap -> sitemap_p + } + +The sitemap is now a child of the ``robots.txt`` file (which we treat as a type of index sitemap) because it's queried first, and well-known URLs are skipped if they've already been retrieved through ``robots.txt``. + +Finally, in this third example, the site has sitemaps listed in ``robots.txt`` and some additional sitemaps at well-known URLs: + +.. 
graphviz:: + :name: fig-sitemap-hierarchy + :align: center + + digraph G { + node [shape=record]; + root [ + xref=":class:`~usp.objects.sitemap.IndexWebsiteSitemap`", + label="{IndexWebsiteSitemap|/}" + ] + + root -> robots + + robots [ + xref=":class:`~usp.objects.sitemap.IndexRobotsTxtSitemap`", + label="{IndexRobotsTxtSitemap|/robots.txt}" + ] + + sitemap [ + xref=":class:`~usp.objects.sitemap.PagesXMLSitemap`", + label="{PagesXMLSitemap|/sitemap.xml}", + style=rounded + ] + sitemap_p [label="Pages", shape=plain] + robots -> sitemap -> sitemap_p + + root -> news_index + news_index [ + xref=":class:`~usp.objects.sitemap.IndexXMLSitemap`", + label="{IndexXMLSitemap|/sitemap_news.xml}" + ] + news_1 [ + xref=":class:`~usp.objects.sitemap.AbstractPagesSitemap`", + label="{PagesXMLSitemap|/sitemap_news_1.xml}", + style=rounded + ] + news_1p [label="Pages", shape=plain] + news_index -> news_1 -> news_1p + news_2 [ + xref=":class:`~usp.objects.sitemap.PagesXMLSitemap`" + label="{PagesXMLSitemap|/sitemap_news_2.xml}", + style=rounded + ] + news_2p [label="Pages", shape=plain] + news_index -> news_2 -> news_2p + news_3 [ + xref=":class:`~usp.objects.sitemap.PagesXMLSitemap`", + label="{PagesXMLSitemap|/sitemap_news_3.xml}", + style=rounded + ] + news_3p [label="Pages", shape=plain] + news_index -> news_3 -> news_3p + } + +Here, ``sitemap_news.xml`` is an example of an XML index sitemap, which contains no pages itself, but just points to 3 sub-sitemaps. It should also be clearer from this example why it's necessary to add the root node to combine the sitemaps found from ``robots.txt`` and well-known URLs. + +Sitemap trees will always have an :class:`~.IndexWebsiteSitemap` at the root, and will usually consist of :class:`~.IndexXMLSitemap` and :class:`~.PagesXMLSitemap` (either directly or through a :class:`~.IndexRobotsTxtSitemap`), but :doc:`other sitemap types are possible `. Regardless, all sitemap classes implement the same interface (:class:`~.AbstractIndexSitemap` or :class:`~.AbstractPagesSitemap`, which both inherit from :class:`~.AbstractSitemap`), so the actual type of sitemap is not important for most use cases. + + +Real-World Example +------------------ + +Large and well-established sites (e.g. media outlets) may have very complex sitemap hierarchies, due to the amount of content and changing technologies for the site. For example, this is the sitemap hierarchy for the BBC website: + +.. dropdown:: bbc.co.uk Sitemap Graph + + .. graphviz:: _sitemap_examples/bbc-sitemap.dot + +Altogether, this sitemap tree contains 2.6 million URLs spread across 75 sitemaps. The ``robots.txt`` file declares 13 sitemaps, some of which are index sitemaps with as many as 50 page sitemaps. Despite this, USP is able to parse this tree in less than a minute and using no more than 90MiB of memory at peak. + + +Note also that there is some duplication in this tree. The sitemap ``/sport/sitemap.xml`` is both directly declared in ``robots.txt`` and also in the index sitemap ``/sitemap.xml``. As these declarations are in different sitemap files, they are both included in the tree. Likewise, the pages declared in the ``/sport/sitemap.xml`` file are included in the tree twice. See the section on :ref:`process_dedup` for details. + +Traversal +--------- + +To traverse the sitemaps and pages in the tree, :class:`~usp.objects.sitemap.AbstractSitemap` declares an interface to access the immediate children of a sitemap node through properties, or all descendants through methods. 
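+
+As a minimal sketch (using only the properties and methods listed below), the whole hierarchy can be
+printed by recursing over each node's direct children:
+
+.. code-block:: python
+
+    from usp.tree import sitemap_tree_for_homepage
+
+    def print_hierarchy(sitemap, depth=0):
+        # every sitemap node exposes its URL and its direct sub-sitemaps
+        print('  ' * depth + sitemap.url)
+        for sub_sitemap in sitemap.sub_sitemaps:
+            print_hierarchy(sub_sitemap, depth + 1)
+
+    tree = sitemap_tree_for_homepage('https://example.org/')
+    print_hierarchy(tree)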
+ +These methods and properties are always implemented, returning or yielding empty lists where not applicable (e.g. accessing sub-sitemaps on a page sitemap, or either sub-sitemaps or pages on an invalid sitemap), meaning they can be called without checking the type of the sitemap. + +For sub-sitemaps: + +- :attr:`AbstractSitemap.sub_sitemaps ` is a list of the direct children of that sitemap +- :meth:`AbstractSitemap.all_sitemaps() ` returns an iterator yielding all descendant sitemaps (depth-first) + +For pages: + +- :attr:`AbstractSitemap.pages ` is a list of the direct children of that sitemap +- :meth:`AbstractSitemap.all_pages() ` returns an iterator yielding all descendant pages (depth-first) diff --git a/docs/index.rst b/docs/index.rst index f58294e..5483570 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,14 +1,92 @@ +:hide-toc: + Ultimate Sitemap Parser ======================= .. toctree:: - :maxdepth: 2 + :hidden: + + get-started + +.. toctree:: + :hidden: + :caption: Guides + + guides/sitemap-tree + guides/fetch-parse + guides/saving + guides/performance + guides/security + +.. toctree:: + :hidden: + :caption: Reference + + Supported Formats + Python API + CLI + +.. toctree:: + :hidden: + :caption: About + + changelog + acknowledgements + GitHub + PyPI + Issues + + + +Ultimate Sitemap Parser (USP) is a performant and robust Python library for parsing and crawling sitemaps. + +- **Supports all sitemap formats**: Sitemap XML, Google News, plain text, RSS 2.0, Atom 0.3/1.0. + +- **Error-tolerant**: Handles common sitemap bugs gracefully. + +- **Automatic sitemap discovery**: Finds sitemaps from *robots.txt* and from common sitemap names. + +- **Fast and memory efficient**: Uses Expat XML parsing, doesn't consume much memory even with massive sitemap hierarchies. Swaps and lazily loads sub-sitemaps to disk. + +- **Field-tested with ~1 million URLs**: Originally developed for the `Media Cloud `_ project where it was used to parse approximately 1 million sitemaps. + + +Installation +------------ + +Ultimate Sitemap Parser can be installed from PyPI or conda-forge: + +.. tab-set:: + + .. tab-item:: pip + + .. code-block:: shell-session + + $ pip install ultimate-sitemap-parser + + .. tab-item:: conda + + .. code-block:: shell-session + + $ conda install -c conda-forge ultimate-sitemap-parser + +Usage +----- + +USP is very easy to use, with just a single line of code it can traverse and parse a website's sitemaps: + +.. code-block:: python + + from usp.tree import sitemap_tree_for_homepage + + tree = sitemap_tree_for_homepage('https://www.example.org/') - modules + for page in tree.all_pages(): + print(page.url) -Indices and tables -================== +Advanced Features +----------------- -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` +- :doc:`CLI Client `: Use the ``usp ls`` tool to work with sitemaps from the command line. +- :doc:`Serialisation `: Export raw data or save to disk and load later +- Custom web clients: Instead of the default client built on `requests `_ you can use your own web client by implementing the :class:`~usp.web_client.abstract_client.AbstractWebClient` interface. \ No newline at end of file diff --git a/docs/modules.rst b/docs/modules.rst deleted file mode 100644 index db5c20f..0000000 --- a/docs/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -usp -=== - -.. 
toctree:: - :maxdepth: 4 - - usp diff --git a/docs/reference/api/index.rst b/docs/reference/api/index.rst new file mode 100644 index 0000000..2231305 --- /dev/null +++ b/docs/reference/api/index.rst @@ -0,0 +1,13 @@ +Python API Reference +==================== + + +.. toctree:: + :titlesonly: + + usp.exceptions + usp.fetch_parse + usp.helpers + usp.objects + usp.tree + usp.web_client \ No newline at end of file diff --git a/docs/reference/api/usp.exceptions.rst b/docs/reference/api/usp.exceptions.rst new file mode 100644 index 0000000..4a8dcd5 --- /dev/null +++ b/docs/reference/api/usp.exceptions.rst @@ -0,0 +1,5 @@ +usp.exceptions +============== + +.. automodule:: usp.exceptions + :members: \ No newline at end of file diff --git a/docs/reference/api/usp.fetch_parse.rst b/docs/reference/api/usp.fetch_parse.rst new file mode 100644 index 0000000..2279a7b --- /dev/null +++ b/docs/reference/api/usp.fetch_parse.rst @@ -0,0 +1,42 @@ +usp.fetch_parse +=============== + +.. automodule:: usp.fetch_parse + +.. autoclass:: SitemapFetcher + :members: + +.. autoclass:: AbstractSitemapParser + :members: + +.. autoclass:: IndexRobotsTxtSitemapParser + :members: + :show-inheritance: + +.. autoclass:: PlainTextSitemapParser + :members: + :show-inheritance: + +.. autoclass:: XMLSitemapParser + :members: + :show-inheritance: + +.. autoclass:: AbstractXMLSitemapParser + :members: + +.. autoclass:: IndexXMLSitemapParser + :members: + :show-inheritance: + +.. autoclass:: PagesXMLSitemapParser + :members: + :show-inheritance: + +.. autoclass:: PagesRSSSitemapParser + :members: + :show-inheritance: + +.. autoclass:: PagesAtomSitemapParser + :members: + :show-inheritance: + diff --git a/docs/reference/api/usp.helpers.rst b/docs/reference/api/usp.helpers.rst new file mode 100644 index 0000000..d6f1cb2 --- /dev/null +++ b/docs/reference/api/usp.helpers.rst @@ -0,0 +1,5 @@ +usp.helpers +=========== + +.. automodule:: usp.helpers + :members: diff --git a/docs/reference/api/usp.objects.page.rst b/docs/reference/api/usp.objects.page.rst new file mode 100644 index 0000000..bde277a --- /dev/null +++ b/docs/reference/api/usp.objects.page.rst @@ -0,0 +1,14 @@ +usp.objects.page +================ +.. automodule:: usp.objects.page + +.. autoclass:: SitemapPage + :members: + +.. autoclass:: SitemapPageChangeFrequency + :members: + :undoc-members: + +.. autoclass:: SitemapNewsStory + :members: + diff --git a/docs/reference/api/usp.objects.rst b/docs/reference/api/usp.objects.rst new file mode 100644 index 0000000..1ec30bc --- /dev/null +++ b/docs/reference/api/usp.objects.rst @@ -0,0 +1,8 @@ +usp.objects +=========== + +.. automodule:: usp.objects + +.. toctree:: + usp.objects.sitemap + usp.objects.page \ No newline at end of file diff --git a/docs/reference/api/usp.objects.sitemap.rst b/docs/reference/api/usp.objects.sitemap.rst new file mode 100644 index 0000000..2fd5dbc --- /dev/null +++ b/docs/reference/api/usp.objects.sitemap.rst @@ -0,0 +1,57 @@ +usp.objects.sitemap +=================== + +.. automodule:: usp.objects.sitemap + +.. autoclass:: AbstractSitemap + :members: + :show-inheritance: + +.. autoclass:: InvalidSitemap + :members: + :show-inheritance: + +Index Sitemaps +-------------- + +.. autoclass:: AbstractIndexSitemap + :members: + :show-inheritance: + :inherited-members: + +.. autoclass:: IndexWebsiteSitemap + :members: + :show-inheritance: + +.. autoclass:: IndexXMLSitemap + :members: + :show-inheritance: + +.. 
autoclass:: IndexRobotsTxtSitemap + :members: + :show-inheritance: + +Page Sitemaps +------------- + +.. autoclass:: AbstractPagesSitemap + :members: + :show-inheritance: + :inherited-members: + :undoc-members: + +.. autoclass:: PagesXMLSitemap + :members: + :show-inheritance: + +.. autoclass:: PagesTextSitemap + :members: + :show-inheritance: + +.. autoclass:: PagesRSSSitemap + :members: + :show-inheritance: + +.. autoclass:: PagesAtomSitemap + :members: + :show-inheritance: diff --git a/docs/reference/api/usp.tree.rst b/docs/reference/api/usp.tree.rst new file mode 100644 index 0000000..463d8f6 --- /dev/null +++ b/docs/reference/api/usp.tree.rst @@ -0,0 +1,7 @@ +usp.tree +======== +.. automodule:: usp.tree + +.. autofunction:: sitemap_tree_for_homepage + +.. autodata:: _UNPUBLISHED_SITEMAP_PATHS diff --git a/docs/reference/api/usp.web_client.abstract_client.rst b/docs/reference/api/usp.web_client.abstract_client.rst new file mode 100644 index 0000000..a4c41f4 --- /dev/null +++ b/docs/reference/api/usp.web_client.abstract_client.rst @@ -0,0 +1,21 @@ +usp.web_client.abstract_client +============================== + +.. automodule:: usp.web_client.abstract_client + +.. autoclass:: AbstractWebClient + :members: + +.. autodata:: RETRYABLE_HTTP_STATUS_CODES + +.. autoclass:: AbstractWebClientResponse + :members: + +.. autoclass:: AbstractWebClientSuccessResponse + :members: + :show-inheritance: + +.. autoclass:: WebClientErrorResponse + :members: + :show-inheritance: + diff --git a/docs/reference/api/usp.web_client.requests_client.rst b/docs/reference/api/usp.web_client.requests_client.rst new file mode 100644 index 0000000..19f9229 --- /dev/null +++ b/docs/reference/api/usp.web_client.requests_client.rst @@ -0,0 +1,19 @@ +usp.web_client.requests_client +============================== + +.. automodule:: usp.web_client.requests_client + +.. autoclass:: RequestsWebClient + :members: + :show-inheritance: + :inherited-members: + +.. autoclass:: RequestsWebClientErrorResponse + :members: + :inherited-members: + :show-inheritance: + +.. autoclass:: RequestsWebClientSuccessResponse + :members: + :show-inheritance: + :inherited-members: diff --git a/docs/reference/api/usp.web_client.rst b/docs/reference/api/usp.web_client.rst new file mode 100644 index 0000000..f5aee79 --- /dev/null +++ b/docs/reference/api/usp.web_client.rst @@ -0,0 +1,6 @@ +usp.web_client +============== + +.. toctree:: + usp.web_client.abstract_client + usp.web_client.requests_client \ No newline at end of file diff --git a/docs/reference/cli.rst b/docs/reference/cli.rst new file mode 100644 index 0000000..2ea9ffd --- /dev/null +++ b/docs/reference/cli.rst @@ -0,0 +1,69 @@ +.. _cli: + +CLI Reference +------------- + +The CLI provides a simple command-line interface to retrieve sitemap data. + +``usp`` +======= + +.. code-block:: none + + usage: usp [-h] [-v] ... + + Ultimate Sitemap Parser + + options: + -h, --help show this help message and exit + -v, --version show program's version number and exit + + commands: + + ls List sitemap pages + +``usp ls`` +========== + +.. 
code-block:: none + + usage: usp ls [-h] [-f] [-r] [-k] [-u] url + + download, parse and list the sitemap structure + + positional arguments: + url URL of the site including protocol + + options: + -h, --help show this help message and exit + -f , --format set output format (default: tabtree) + choices: + tabtree: Sitemaps and pages, nested with tab indentation + pages: Flat list of pages, one per line + -r, --no-robots don't discover sitemaps through robots.txt + -k, --no-known don't discover sitemaps through well-known URLs + -u, --strip-url strip the supplied URL from each page and sitemap URL + +.. rubric:: Examples + +.. code-block:: shell-session + + $ usp ls https://example.org/ + https://example.org/ + https://example.org/robots.txt + https://example.org/sitemap.xml + https://example.org/page1.html + + +.. code-block:: shell-session + + $ usp ls https://example.org/ --strip-url + https://example.org/ + /robots.txt + /sitemap.xml + /page1.html + +.. code-block:: shell-session + + $ usp ls https://example.org/ --format pages + https://example.org/page1.html \ No newline at end of file diff --git a/docs/reference/formats.rst b/docs/reference/formats.rst new file mode 100644 index 0000000..63ee65d --- /dev/null +++ b/docs/reference/formats.rst @@ -0,0 +1,227 @@ +Supported Formats Reference +--------------------------- + +Overview of Parsers +=================== + +.. table:: + + +-----------------------+-------------------------------------------------------+-----------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | Format | Index | Pages | + | +-------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------------------------------------+ + | | Parser | Object | Parser | Object | + +=======================+=======================================================+=====================================================+==================================================+================================================+ + | Website [*]_ | | :class:`~usp.objects.sitemap.IndexWebsiteSitemap` | | | + +-----------------------+-------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------------------------------------+ + | `Robots.txt`_ | :class:`~usp.fetch_parse.IndexRobotsTxtSitemapParser` | :class:`~usp.objects.sitemap.IndexRobotsTxtSitemap` | | | + +-----------------------+-------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------------------------------------+ + | `Plain text`_ | | | :class:`~usp.fetch_parse.PlainTextSitemapParser` | :class:`~usp.objects.sitemap.PagesTextSitemap` | + +-----------------------+-------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------------------------------------+ + | XML | :class:`~usp.fetch_parse.XMLSitemapParser` | | :class:`~usp.fetch_parse.XMLSitemapParser` | | + 
+-----+-----------------+-------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------------------------------------+ + | | `Sitemap`_ | :class:`~usp.fetch_parse.IndexXMLSitemapParser` | :class:`~usp.objects.sitemap.IndexXMLSitemap` | :class:`~usp.fetch_parse.PagesXMLSitemapParser` | :class:`~usp.objects.sitemap.PagesXMLSitemap` | + | +-----------------+-------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------------------------------------+ + | | `RSS 2.0`_ | | | :class:`~usp.fetch_parse.PagesRSSSitemapParser` | :class:`~usp.objects.sitemap.PagesRSSSitemap` | + | +-----------------+-------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------------------------------------+ + | | `Atom 0.3/1.0`_ | | | :class:`~usp.fetch_parse.PagesAtomSitemapParser` | :class:`~usp.objects.sitemap.PagesAtomSitemap` | + +-----+-----------------+-------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------------------------------------+ + | Unknown | | :class:`~usp.objects.sitemap.InvalidSitemap` | | :class:`~usp.objects.sitemap.InvalidSitemap` | + +-----------------------+-------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------------------------------------+ + +.. [*] Represents the root of the website to allow for robots.txt and path-discovered sitemaps. + + +Robots.txt +========== + +.. dropdown:: Example + :class-container: flush + + .. literalinclude:: formats_examples/robots.txt + :language: text + +- `RFC 9309`_ +- `Google documentation `__ +- `Google library implementation `__ + +The robots.txt parser implements the same logic to detect ``Sitemap`` keys as Google's parser, namely it is case insensitive and supports ``Sitemap`` or ``Site-map``. + +Plain Text +========== + +.. dropdown:: Example + :class-container: flush + + .. literalinclude:: formats_examples/plaintext.txt + :language: text + +- `Sitemaps.org specification `__ +- `Google documentation `__ + +The plain text parser reads the file line by line and considers anything that appears to be a useful URL a page. Specifically, it looks for lines that appear to be URLs, can be parsed successfully by :func:`python:urllib.parse.urlparse`, and have the HTTP or HTTPS protocol and has a non-empty hostname. This means that non-URLs in the file will simply be ignored, which is more permissive than the either standard. + +.. _Sitemap: + +XML Sitemap +=========== + +.. dropdown:: Examples + :class-container: flush + + .. tab-set:: + + .. tab-item:: Index + + .. literalinclude:: formats_examples/simple-index.xml + :language: xml + + .. tab-item:: URL Set + + .. literalinclude:: formats_examples/simple-urlset.xml + :language: xml + +- `Sitemaps.org specification `__ +- `Google documentation `__ + +Sitemaps XML (not to be confused with other sitemap formats that happen to be in XML) is the most common kind of sitemap. + +The Sitemaps XML parser supports both the Sitemap and Sitemap index formats. 
+
+The parser supports the following non-standard features:
+
+- Truncated files (perhaps because the web server timed out while serving the file) will be parsed as much as possible
+- Any unexpected tags are ignored
+- Timestamps are :ref:`parsed flexibly <xml date>`
+
+.. note::
+
+    Namespaces must be declared to parse the sitemap and any extensions correctly. Any unrecognised namespaces will be ignored.
+
+.. _xml sitemap extensions:
+
+XML Sitemap Extensions
+^^^^^^^^^^^^^^^^^^^^^^
+
+- `Google documentation on combining sitemap extensions `__
+
+.. note::
+
+    Only the Google News extension is currently supported. Other extensions (e.g. `Google Image`_ and `Google Video`_) are ignored, and only the standard part of the sitemap will be parsed.
+
+.. _Google Image: https://developers.google.com/search/docs/crawling-indexing/sitemaps/image-sitemaps
+.. _Google Video: https://developers.google.com/search/docs/crawling-indexing/sitemaps/video-sitemaps
+
+
+.. _google-news-ext:
+
+Google News
+"""""""""""
+
+- `Google documentation `__
+
+
+.. dropdown:: Example
+    :class-container: flush
+
+    .. literalinclude:: formats_examples/google-news.xml
+        :emphasize-lines: 3,8-14,20-26
+        :language: xml
+
+
+The Google News extension provides additional information to describe the news story that a webpage represents, in addition to the page itself.
+
+If the page contains Google News data, it is stored as a :class:`~usp.objects.page.SitemapNewsStory` object in :attr:`SitemapPage.news_story <usp.objects.page.SitemapPage.news_story>`.
+
+.. _xml date:
+
+Date Time Parsing
+^^^^^^^^^^^^^^^^^
+
+It is relatively common for sitemaps to not correctly follow the `W3C Datetime`_ format (a subset of `ISO 8601`_). To handle this, date times are parsed flexibly with fallbacks. This is done in two steps to allow the faster, more reliable parser to be used where possible.
+
+First, an attempt is made with a full ISO 8601 parser:
+
+- In Python ≥ 3.11, :meth:`datetime.fromisoformat() <python:datetime.datetime.fromisoformat>` is tried first.
+- In older versions [#dtvers]_, :meth:`dateutil:dateutil.parser.isoparse` is used.
+
+If this is unsuccessful, :meth:`dateutil:dateutil.parser.parse` is tried, which can parse most standard forms of date but is slower and more likely to mis-parse.
+
+Without trying the optimised parser first, datetime parsing in large sitemaps would take a significant proportion of the total runtime.
+
+RSS 2.0
+=======
+
+.. dropdown:: Example
+    :class-container: flush
+
+    .. literalinclude:: formats_examples/rss2.0.xml
+        :language: xml
+
+- `RSS 2.0 specification `__
+- `Sitemaps.org specification `__
+- `Google documentation `__
+
+Implementation details:
+
+- Per the specification, ``<item>`` elements without a ``<title>`` or ``<description>`` are invalid and ignored.
+- Although the specification states ``<link>`` is optional, we ignore an ``<item>`` if it does not contain one
+- Dates are parsed flexibly
+
+.. note::
+
+    `mRSS <https://www.rssboard.org/media-rss>`_ is not currently supported and will be ignored.
+
+.. _rss date:
+
+Date Time Parsing
+^^^^^^^^^^^^^^^^^
+
+It is relatively common for feeds to not correctly follow the `RFC 2822`_ format. To handle this, date times are parsed with :meth:`dateutil:dateutil.parser.parse`, which is able to parse most standard forms of date. Given that feeds should be short, the performance impact of this is minimal.
+
+
+Atom 0.3/1.0
+============
+
+.. dropdown:: Examples
+    :class-container: flush
+
+    .. tab-set::
+
+        .. tab-item:: Atom 0.3
+
+            .. literalinclude:: formats_examples/atom0.3.xml
+                :language: xml
+
+        .. 
tab-item:: Atom 1.0 + + .. literalinclude:: formats_examples/atom1.0.xml + :language: xml + +- `Atom 0.3 specification <https://web.archive.org/web/20060811235523/http://www.mnot.net/drafts/draft-nottingham-atom-format-02.html>`__ +- `Atom 1.0 specification <https://www.rfc-editor.org/rfc/rfc4287.html>`__ +- `Moving from Atom 0.3 to 1.0 <https://web.archive.org/web/20090717114706/http://rakaz.nl/2005/07/moving-from-atom-03-to-10.html>`__ by Niels Leenheer +- `Google documentation <https://developers.google.com/search/docs/crawling-indexing/sitemaps/build-sitemap#rss>`__ + +Implementation details: + +- The same parser is used for 0.3 and 1.0, and it does not attempt to detect the version, therefore it can accept invalid feeds which are a mixture of both +- Dates are parsed flexibly + +.. _atom date: + +Date Time Parsing +^^^^^^^^^^^^^^^^^ + +Atom 0.3 follows the `W3C Datetime`_ (a subset of `ISO 8601`_) format, and Atom 1.0 follows `RFC 3339`_ (which is similar but not entirely equivalent to ISO 8601 [#3339-8601-diff]_). In either case, :meth:`dateutil:dateutil.parser.parse` is used to parse the date, which is able to parse most standard forms of date. Given that feeds should be short, the performance impact of this is minimal. + + +.. rubric:: Footnotes + +.. [#dtvers] Prior to Python 3.11, :meth:`datetime.fromisoformat() <python:datetime.datetime.fromisoformat>` could only parse times in the specific ISO 8601 format emitted by :meth:`datetime.isoformat() <python:datetime.datetime.isoformat>` so is unsuitable as a general parser. +.. [#3339-8601-diff] See `this page <https://ijmacd.github.io/rfc3339-iso8601/>`_ for some examples. + +.. _W3C Datetime: https://www.w3.org/TR/NOTE-datetime +.. _ISO 8601: https://en.wikipedia.org/wiki/ISO_8601 +.. _RFC 3339: https://www.rfc-editor.org/rfc/rfc3339.html +.. _RFC 2822: https://www.rfc-editor.org/rfc/rfc2822.html#page-14 +.. 
_RFC 9309: https://www.rfc-editor.org/rfc/rfc9309.html \ No newline at end of file diff --git a/docs/reference/formats_examples/atom0.3.xml b/docs/reference/formats_examples/atom0.3.xml new file mode 100644 index 0000000..967cf41 --- /dev/null +++ b/docs/reference/formats_examples/atom0.3.xml @@ -0,0 +1,18 @@ +<?xml version="1.0" encoding="UTF-8"?> +<feed version="0.3" xmlns="http://purl.org/atom/ns#"> + <title>Example + + 2024-01-01 + + Page 1 + + https://example.org/page1 + 2024-01-01 + + + Page 2 + + https://example.org/page2 + 2024-01-02 + + \ No newline at end of file diff --git a/docs/reference/formats_examples/atom1.0.xml b/docs/reference/formats_examples/atom1.0.xml new file mode 100644 index 0000000..4f35803 --- /dev/null +++ b/docs/reference/formats_examples/atom1.0.xml @@ -0,0 +1,18 @@ + + + Example + + 2024-01-01 + + Page 1 + + https://example.org/page1 + 2024-01-01 + + + Page 2 + + https://example.org/page2 + 2024-01-02 + + \ No newline at end of file diff --git a/docs/reference/formats_examples/google-news.xml b/docs/reference/formats_examples/google-news.xml new file mode 100644 index 0000000..7f433a7 --- /dev/null +++ b/docs/reference/formats_examples/google-news.xml @@ -0,0 +1,28 @@ + + + + https://example.org/page1 + 2024-01-01 + + + + Example.org News + + 2024-01-01 + News Article One + + + + https://example.org/page2 + 2024-01-02 + + + + Example.org News + + 2024-01-02 + News Article Two + + + \ No newline at end of file diff --git a/docs/reference/formats_examples/plaintext.txt b/docs/reference/formats_examples/plaintext.txt new file mode 100644 index 0000000..3e2c98c --- /dev/null +++ b/docs/reference/formats_examples/plaintext.txt @@ -0,0 +1,4 @@ +Yes, there are plain text sitemaps and they could just contain random text. + +https://example.org/page1 +https://example.org/page2 \ No newline at end of file diff --git a/docs/reference/formats_examples/robots.txt b/docs/reference/formats_examples/robots.txt new file mode 100644 index 0000000..5980741 --- /dev/null +++ b/docs/reference/formats_examples/robots.txt @@ -0,0 +1,13 @@ +# An example robots.txt file +# Some sitemap declarations, Google allows all of these +Sitemap: https://example.org/sitemap.xml +sitemap: https://example.org/weirdly-named-sitemap.xml +site-map: https://example.org/another-sitemap.xml +sItE-mAp: https://example.org/and-another-sitemap.xml + +# And other stuff too +User-agent: * +Disallow: /admin/ + +User-agent: GPTBot +Disallow: / \ No newline at end of file diff --git a/docs/reference/formats_examples/rss2.0.xml b/docs/reference/formats_examples/rss2.0.xml new file mode 100644 index 0000000..21d702a --- /dev/null +++ b/docs/reference/formats_examples/rss2.0.xml @@ -0,0 +1,18 @@ + + + + Example + https://example.org/ + Example + + Page 1 + https://example.org/page1 + 2024-01-01 + + + Page 2 + https://example.org/page2 + 2024-01-02 + + + \ No newline at end of file diff --git a/docs/reference/formats_examples/simple-index.xml b/docs/reference/formats_examples/simple-index.xml new file mode 100644 index 0000000..bb7e101 --- /dev/null +++ b/docs/reference/formats_examples/simple-index.xml @@ -0,0 +1,11 @@ + + + + https://example.org/sitemap1.xml + 2024-01-01 + + + https://example.org/sitemap2.xml + 2024-01-02 + + \ No newline at end of file diff --git a/docs/reference/formats_examples/simple-urlset.xml b/docs/reference/formats_examples/simple-urlset.xml new file mode 100644 index 0000000..dd96ad5 --- /dev/null +++ b/docs/reference/formats_examples/simple-urlset.xml @@ -0,0 +1,11 @@ + + + + 
https://example.org/page1 + 2024-01-01 + + + https://example.org/page2 + 2024-01-02 + + \ No newline at end of file diff --git a/docs/usp.objects.rst b/docs/usp.objects.rst deleted file mode 100644 index 01ca8a3..0000000 --- a/docs/usp.objects.rst +++ /dev/null @@ -1,30 +0,0 @@ -usp.objects package -======================= - -Submodules ----------- - -usp.objects.page module ---------------------------------------- - -.. automodule:: usp.objects.page - :members: - :undoc-members: - :show-inheritance: - -usp.objects.sitemap module ---------------------------------------- - -.. automodule:: usp.objects.sitemap - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: usp.objects - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/usp.rst b/docs/usp.rst deleted file mode 100644 index 0073b4f..0000000 --- a/docs/usp.rst +++ /dev/null @@ -1,38 +0,0 @@ -usp package -=========== - -Subpackages ------------ - -.. toctree:: - - usp.objects - usp.web_client - -Submodules ----------- - -usp.exceptions module ---------------------- - -.. automodule:: usp.exceptions - :members: - :undoc-members: - :show-inheritance: - -usp.tree module ---------------- - -.. automodule:: usp.tree - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: usp - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/usp.web_client.rst b/docs/usp.web_client.rst deleted file mode 100644 index aea7280..0000000 --- a/docs/usp.web_client.rst +++ /dev/null @@ -1,30 +0,0 @@ -usp.web\_client package -======================= - -Submodules ----------- - -usp.web\_client.abstract\_client module ---------------------------------------- - -.. automodule:: usp.web_client.abstract_client - :members: - :undoc-members: - :show-inheritance: - -usp.web\_client.requests\_client module ---------------------------------------- - -.. automodule:: usp.web_client.requests_client - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: usp.web_client - :members: - :undoc-members: - :show-inheritance: diff --git a/poetry.lock b/poetry.lock index 9973c30..b920318 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,14 +1,82 @@ # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
+[[package]] +name = "alabaster" +version = "1.0.0" +description = "A light, configurable Sphinx theme" +optional = false +python-versions = ">=3.10" +files = [ + {file = "alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b"}, + {file = "alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e"}, +] + +[[package]] +name = "anyio" +version = "4.4.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.8" +files = [ + {file = "anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7"}, + {file = "anyio-4.4.0.tar.gz", hash = "sha256:5aadc6a1bbb7cdb0bede386cac5e2940f5e2ff3aa20277e991cf028e0585ce94"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (>=0.23)"] + +[[package]] +name = "babel" +version = "2.16.0" +description = "Internationalization utilities" +optional = false +python-versions = ">=3.8" +files = [ + {file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"}, + {file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"}, +] + +[package.extras] +dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] + +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, + {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "certifi" -version = "2024.7.4" +version = "2024.8.30" description = "Python package for providing Mozilla's CA Bundle." 
optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, - {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, + {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, + {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, ] [[package]] @@ -110,6 +178,20 @@ files = [ {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] +[[package]] +name = "click" +version = "8.1.7" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" @@ -121,6 +203,17 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "docutils" +version = "0.21.2" +description = "Docutils -- Python Documentation Utilities" +optional = false +python-versions = ">=3.9" +files = [ + {file = "docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2"}, + {file = "docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f"}, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -135,15 +228,54 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "furo" +version = "2024.8.6" +description = "A clean customisable Sphinx documentation theme." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "furo-2024.8.6-py3-none-any.whl", hash = "sha256:6cd97c58b47813d3619e63e9081169880fbe331f0ca883c871ff1f3f11814f5c"}, + {file = "furo-2024.8.6.tar.gz", hash = "sha256:b63e4cee8abfc3136d3bc03a3d45a76a850bada4d6374d24c1716b0e01394a01"}, +] + +[package.dependencies] +beautifulsoup4 = "*" +pygments = ">=2.7" +sphinx = ">=6.0,<9.0" +sphinx-basic-ng = ">=1.0.0.beta2" + +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + [[package]] name = "idna" -version = "3.7" +version = "3.8" description = "Internationalized Domain Names in Applications (IDNA)" optional = false -python-versions = ">=3.5" +python-versions = ">=3.6" +files = [ + {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"}, + {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"}, +] + +[[package]] +name = "imagesize" +version = "1.4.1" +description = "Getting image size from png/jpeg/jpeg2000/gif file" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ - {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, - {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, + {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"}, + {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, ] [[package]] @@ -751,13 +883,13 @@ fixture = ["fixtures"] [[package]] name = "rich" -version = "13.7.1" +version = "13.8.0" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false python-versions = ">=3.7.0" files = [ - {file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"}, - {file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"}, + {file = "rich-13.8.0-py3-none-any.whl", hash = "sha256:2e85306a063b9492dffc86278197a60cbece75bcb766022f3436f567cae11bdc"}, + {file = "rich-13.8.0.tar.gz", hash = "sha256:a5ac1f1cd448ade0d59cc3356f7db7a7ccda2c8cbae9c7a90c28ff463d3e91f4"}, ] [package.dependencies] @@ -770,29 +902,29 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "ruff" -version = "0.6.1" +version = "0.6.3" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.6.1-py3-none-linux_armv6l.whl", hash = "sha256:b4bb7de6a24169dc023f992718a9417380301b0c2da0fe85919f47264fb8add9"}, - {file = "ruff-0.6.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:45efaae53b360c81043e311cdec8a7696420b3d3e8935202c2846e7a97d4edae"}, - {file = "ruff-0.6.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:bc60c7d71b732c8fa73cf995efc0c836a2fd8b9810e115be8babb24ae87e0850"}, - {file = "ruff-0.6.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c7477c3b9da822e2db0b4e0b59e61b8a23e87886e727b327e7dcaf06213c5cf"}, - {file = "ruff-0.6.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a0af7ab3f86e3dc9f157a928e08e26c4b40707d0612b01cd577cc84b8905cc9"}, - {file = "ruff-0.6.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:392688dbb50fecf1bf7126731c90c11a9df1c3a4cdc3f481b53e851da5634fa5"}, - {file = "ruff-0.6.1-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5278d3e095ccc8c30430bcc9bc550f778790acc211865520f3041910a28d0024"}, - {file = "ruff-0.6.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fe6d5f65d6f276ee7a0fc50a0cecaccb362d30ef98a110f99cac1c7872df2f18"}, - {file = "ruff-0.6.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2e0dd11e2ae553ee5c92a81731d88a9883af8db7408db47fc81887c1f8b672e"}, - {file = "ruff-0.6.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d812615525a34ecfc07fd93f906ef5b93656be01dfae9a819e31caa6cfe758a1"}, - {file = "ruff-0.6.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:faaa4060f4064c3b7aaaa27328080c932fa142786f8142aff095b42b6a2eb631"}, - {file = "ruff-0.6.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:99d7ae0df47c62729d58765c593ea54c2546d5de213f2af2a19442d50a10cec9"}, - {file = "ruff-0.6.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9eb18dfd7b613eec000e3738b3f0e4398bf0153cb80bfa3e351b3c1c2f6d7b15"}, - {file = "ruff-0.6.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:c62bc04c6723a81e25e71715aa59489f15034d69bf641df88cb38bdc32fd1dbb"}, - {file = "ruff-0.6.1-py3-none-win32.whl", hash = "sha256:9fb4c4e8b83f19c9477a8745e56d2eeef07a7ff50b68a6998f7d9e2e3887bdc4"}, - {file = "ruff-0.6.1-py3-none-win_amd64.whl", hash = "sha256:c2ebfc8f51ef4aca05dad4552bbcf6fe8d1f75b2f6af546cc47cc1c1ca916b5b"}, - {file = "ruff-0.6.1-py3-none-win_arm64.whl", hash = "sha256:3bc81074971b0ffad1bd0c52284b22411f02a11a012082a76ac6da153536e014"}, - {file = "ruff-0.6.1.tar.gz", hash = "sha256:af3ffd8c6563acb8848d33cd19a69b9bfe943667f0419ca083f8ebe4224a3436"}, + {file = "ruff-0.6.3-py3-none-linux_armv6l.whl", hash = "sha256:97f58fda4e309382ad30ede7f30e2791d70dd29ea17f41970119f55bdb7a45c3"}, + {file = "ruff-0.6.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:3b061e49b5cf3a297b4d1c27ac5587954ccb4ff601160d3d6b2f70b1622194dc"}, + {file = "ruff-0.6.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:34e2824a13bb8c668c71c1760a6ac7d795ccbd8d38ff4a0d8471fdb15de910b1"}, + {file = "ruff-0.6.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bddfbb8d63c460f4b4128b6a506e7052bad4d6f3ff607ebbb41b0aa19c2770d1"}, + {file = "ruff-0.6.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ced3eeb44df75353e08ab3b6a9e113b5f3f996bea48d4f7c027bc528ba87b672"}, + {file = "ruff-0.6.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:47021dff5445d549be954eb275156dfd7c37222acc1e8014311badcb9b4ec8c1"}, + {file = "ruff-0.6.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:7d7bd20dc07cebd68cc8bc7b3f5ada6d637f42d947c85264f94b0d1cd9d87384"}, + {file = "ruff-0.6.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:500f166d03fc6d0e61c8e40a3ff853fa8a43d938f5d14c183c612df1b0d6c58a"}, + {file = "ruff-0.6.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42844ff678f9b976366b262fa2d1d1a3fe76f6e145bd92c84e27d172e3c34500"}, + {file = "ruff-0.6.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70452a10eb2d66549de8e75f89ae82462159855e983ddff91bc0bce6511d0470"}, + {file = "ruff-0.6.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:65a533235ed55f767d1fc62193a21cbf9e3329cf26d427b800fdeacfb77d296f"}, + {file = "ruff-0.6.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d2e2c23cef30dc3cbe9cc5d04f2899e7f5e478c40d2e0a633513ad081f7361b5"}, + {file = "ruff-0.6.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d8a136aa7d228975a6aee3dd8bea9b28e2b43e9444aa678fb62aeb1956ff2351"}, + {file = "ruff-0.6.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:f92fe93bc72e262b7b3f2bba9879897e2d58a989b4714ba6a5a7273e842ad2f8"}, + {file = "ruff-0.6.3-py3-none-win32.whl", hash = "sha256:7a62d3b5b0d7f9143d94893f8ba43aa5a5c51a0ffc4a401aa97a81ed76930521"}, + {file = "ruff-0.6.3-py3-none-win_amd64.whl", hash = "sha256:746af39356fee2b89aada06c7376e1aa274a23493d7016059c3a72e3b296befb"}, + {file = "ruff-0.6.3-py3-none-win_arm64.whl", hash = "sha256:14a9528a8b70ccc7a847637c29e56fd1f9183a9db743bbc5b8e0c4ad60592a82"}, + {file = "ruff-0.6.3.tar.gz", hash = "sha256:183b99e9edd1ef63be34a3b51fee0a9f4ab95add123dbf89a71f7b1f0c991983"}, ] [[package]] @@ -806,6 +938,281 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + +[[package]] +name = "snowballstemmer" +version = "2.2.0" +description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." +optional = false +python-versions = "*" +files = [ + {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, + {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, +] + +[[package]] +name = "soupsieve" +version = "2.6" +description = "A modern CSS selector implementation for Beautiful Soup." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"}, + {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, +] + +[[package]] +name = "sphinx" +version = "8.0.2" +description = "Python documentation generator" +optional = false +python-versions = ">=3.10" +files = [ + {file = "sphinx-8.0.2-py3-none-any.whl", hash = "sha256:56173572ae6c1b9a38911786e206a110c9749116745873feae4f9ce88e59391d"}, + {file = "sphinx-8.0.2.tar.gz", hash = "sha256:0cce1ddcc4fd3532cf1dd283bc7d886758362c5c1de6598696579ce96d8ffa5b"}, +] + +[package.dependencies] +alabaster = ">=0.7.14" +babel = ">=2.13" +colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""} +docutils = ">=0.20,<0.22" +imagesize = ">=1.3" +Jinja2 = ">=3.1" +packaging = ">=23.0" +Pygments = ">=2.17" +requests = ">=2.30.0" +snowballstemmer = ">=2.2" +sphinxcontrib-applehelp = "*" +sphinxcontrib-devhelp = "*" +sphinxcontrib-htmlhelp = ">=2.0.0" +sphinxcontrib-jsmath = "*" +sphinxcontrib-qthelp = "*" +sphinxcontrib-serializinghtml = ">=1.1.9" +tomli = {version = ">=2", markers = "python_version < \"3.11\""} + +[package.extras] +docs = ["sphinxcontrib-websupport"] +lint = ["flake8 (>=6.0)", "mypy (==1.11.0)", "pytest (>=6.0)", "ruff (==0.5.5)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-Pillow (==10.2.0.20240520)", "types-Pygments (==2.18.0.20240506)", "types-colorama (==0.4.15.20240311)", "types-defusedxml (==0.7.0.20240218)", "types-docutils (==0.21.0.20240724)", "types-requests (>=2.30.0)"] +test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools (>=70.0)", "typing_extensions (>=4.9)"] + +[[package]] +name = "sphinx-autobuild" +version = "2024.9.3" +description = "Rebuild Sphinx documentation on changes, with hot reloading in the browser." +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinx_autobuild-2024.9.3-py3-none-any.whl", hash = "sha256:55fe9bcc05dab659650d79bed0e6beb8b6032234edbf23f028f2cac3471f0c2d"}, + {file = "sphinx_autobuild-2024.9.3.tar.gz", hash = "sha256:75929a5a92b932da8d29837406d6d973a927c456f30986a27f1f20b067897892"}, +] + +[package.dependencies] +colorama = ">=0.4.6" +sphinx = "*" +starlette = ">=0.35" +uvicorn = ">=0.25" +watchfiles = ">=0.20" +websockets = ">=11" + +[package.extras] +test = ["httpx", "pytest (>=6)"] + +[[package]] +name = "sphinx-basic-ng" +version = "1.0.0b2" +description = "A modern skeleton for Sphinx themes." +optional = false +python-versions = ">=3.7" +files = [ + {file = "sphinx_basic_ng-1.0.0b2-py3-none-any.whl", hash = "sha256:eb09aedbabfb650607e9b4b68c9d240b90b1e1be221d6ad71d61c52e29f7932b"}, + {file = "sphinx_basic_ng-1.0.0b2.tar.gz", hash = "sha256:9ec55a47c90c8c002b5960c57492ec3021f5193cb26cebc2dc4ea226848651c9"}, +] + +[package.dependencies] +sphinx = ">=4.0" + +[package.extras] +docs = ["furo", "ipython", "myst-parser", "sphinx-copybutton", "sphinx-inline-tabs"] + +[[package]] +name = "sphinx-copybutton" +version = "0.5.2" +description = "Add a copy button to each of your code cells." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "sphinx-copybutton-0.5.2.tar.gz", hash = "sha256:4cf17c82fb9646d1bc9ca92ac280813a3b605d8c421225fd9913154103ee1fbd"}, + {file = "sphinx_copybutton-0.5.2-py3-none-any.whl", hash = "sha256:fb543fd386d917746c9a2c50360c7905b605726b9355cd26e9974857afeae06e"}, +] + +[package.dependencies] +sphinx = ">=1.8" + +[package.extras] +code-style = ["pre-commit (==2.12.1)"] +rtd = ["ipython", "myst-nb", "sphinx", "sphinx-book-theme", "sphinx-examples"] + +[[package]] +name = "sphinx-design" +version = "0.6.1" +description = "A sphinx extension for designing beautiful, view size responsive web components." +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinx_design-0.6.1-py3-none-any.whl", hash = "sha256:b11f37db1a802a183d61b159d9a202314d4d2fe29c163437001324fe2f19549c"}, + {file = "sphinx_design-0.6.1.tar.gz", hash = "sha256:b44eea3719386d04d765c1a8257caca2b3e6f8421d7b3a5e742c0fd45f84e632"}, +] + +[package.dependencies] +sphinx = ">=6,<9" + +[package.extras] +code-style = ["pre-commit (>=3,<4)"] +rtd = ["myst-parser (>=2,<4)"] +testing = ["defusedxml", "myst-parser (>=2,<4)", "pytest (>=8.3,<9.0)", "pytest-cov", "pytest-regressions"] +testing-no-myst = ["defusedxml", "pytest (>=8.3,<9.0)", "pytest-cov", "pytest-regressions"] +theme-furo = ["furo (>=2024.7.18,<2024.8.0)"] +theme-im = ["sphinx-immaterial (>=0.12.2,<0.13.0)"] +theme-pydata = ["pydata-sphinx-theme (>=0.15.2,<0.16.0)"] +theme-rtd = ["sphinx-rtd-theme (>=2.0,<3.0)"] +theme-sbt = ["sphinx-book-theme (>=1.1,<2.0)"] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "2.0.0" +description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5"}, + {file = "sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "2.0.0" +description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp documents" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2"}, + {file = "sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.1.0" +description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8"}, + {file = "sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["html5lib", "pytest"] + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +description = "A sphinx extension which renders display 
math in HTML via JavaScript" +optional = false +python-versions = ">=3.5" +files = [ + {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, + {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, +] + +[package.extras] +test = ["flake8", "mypy", "pytest"] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "2.0.0" +description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp documents" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb"}, + {file = "sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["defusedxml (>=0.7.1)", "pytest"] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "2.0.0" +description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331"}, + {file = "sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxext-opengraph" +version = "0.9.1" +description = "Sphinx Extension to enable OGP support" +optional = false +python-versions = ">=3.8" +files = [ + {file = "sphinxext-opengraph-0.9.1.tar.gz", hash = "sha256:dd2868a1e7c9497977fbbf44cc0844a42af39ca65fe1bb0272518af225d06fc5"}, + {file = "sphinxext_opengraph-0.9.1-py3-none-any.whl", hash = "sha256:b3b230cc6a5b5189139df937f0d9c7b23c7c204493b22646273687969dcb760e"}, +] + +[package.dependencies] +sphinx = ">=4.0" + +[[package]] +name = "starlette" +version = "0.38.4" +description = "The little ASGI library that shines." +optional = false +python-versions = ">=3.8" +files = [ + {file = "starlette-0.38.4-py3-none-any.whl", hash = "sha256:526f53a77f0e43b85f583438aee1a940fd84f8fd610353e8b0c1a77ad8a87e76"}, + {file = "starlette-0.38.4.tar.gz", hash = "sha256:53a7439060304a208fea17ed407e998f46da5e5d9b1addfea3040094512a6379"}, +] + +[package.dependencies] +anyio = ">=3.4.0,<5" + +[package.extras] +full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"] + [[package]] name = "textual" version = "0.73.0" @@ -863,13 +1270,13 @@ test = ["coverage", "pytest", "pytest-cov"] [[package]] name = "urllib3" -version = "1.26.19" +version = "1.26.20" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, - {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, + {file = "urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e"}, + {file = "urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32"}, ] [package.extras] @@ -894,6 +1301,25 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "uvicorn" +version = "0.30.6" +description = "The lightning-fast ASGI server." +optional = false +python-versions = ">=3.8" +files = [ + {file = "uvicorn-0.30.6-py3-none-any.whl", hash = "sha256:65fd46fe3fda5bdc1b03b94eb634923ff18cd35b2f084813ea79d1f103f711b5"}, + {file = "uvicorn-0.30.6.tar.gz", hash = "sha256:4b15decdda1e72be08209e860a1e10e92439ad5b97cf44cc945fcbee66fc5788"}, +] + +[package.dependencies] +click = ">=7.0" +h11 = ">=0.8" +typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} + +[package.extras] +standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] + [[package]] name = "vcrpy" version = "6.0.1" @@ -914,6 +1340,196 @@ yarl = "*" [package.extras] tests = ["Werkzeug (==2.0.3)", "aiohttp", "boto3", "httplib2", "httpx", "pytest", "pytest-aiohttp", "pytest-asyncio", "pytest-cov", "pytest-httpbin", "requests (>=2.22.0)", "tornado", "urllib3"] +[[package]] +name = "watchfiles" +version = "0.24.0" +description = "Simple, modern and high performance file watching and code reload in python." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "watchfiles-0.24.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:083dc77dbdeef09fa44bb0f4d1df571d2e12d8a8f985dccde71ac3ac9ac067a0"}, + {file = "watchfiles-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e94e98c7cb94cfa6e071d401ea3342767f28eb5a06a58fafdc0d2a4974f4f35c"}, + {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82ae557a8c037c42a6ef26c494d0631cacca040934b101d001100ed93d43f361"}, + {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:acbfa31e315a8f14fe33e3542cbcafc55703b8f5dcbb7c1eecd30f141df50db3"}, + {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b74fdffce9dfcf2dc296dec8743e5b0332d15df19ae464f0e249aa871fc1c571"}, + {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:449f43f49c8ddca87c6b3980c9284cab6bd1f5c9d9a2b00012adaaccd5e7decd"}, + {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4abf4ad269856618f82dee296ac66b0cd1d71450fc3c98532d93798e73399b7a"}, + {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f895d785eb6164678ff4bb5cc60c5996b3ee6df3edb28dcdeba86a13ea0465e"}, + {file = "watchfiles-0.24.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7ae3e208b31be8ce7f4c2c0034f33406dd24fbce3467f77223d10cd86778471c"}, + {file = "watchfiles-0.24.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2efec17819b0046dde35d13fb8ac7a3ad877af41ae4640f4109d9154ed30a188"}, + {file = "watchfiles-0.24.0-cp310-none-win32.whl", hash = "sha256:6bdcfa3cd6fdbdd1a068a52820f46a815401cbc2cb187dd006cb076675e7b735"}, + {file = "watchfiles-0.24.0-cp310-none-win_amd64.whl", hash = "sha256:54ca90a9ae6597ae6dc00e7ed0a040ef723f84ec517d3e7ce13e63e4bc82fa04"}, + {file = "watchfiles-0.24.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:bdcd5538e27f188dd3c804b4a8d5f52a7fc7f87e7fd6b374b8e36a4ca03db428"}, + {file = "watchfiles-0.24.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2dadf8a8014fde6addfd3c379e6ed1a981c8f0a48292d662e27cabfe4239c83c"}, + {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6509ed3f467b79d95fc62a98229f79b1a60d1b93f101e1c61d10c95a46a84f43"}, + {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8360f7314a070c30e4c976b183d1d8d1585a4a50c5cb603f431cebcbb4f66327"}, + {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:316449aefacf40147a9efaf3bd7c9bdd35aaba9ac5d708bd1eb5763c9a02bef5"}, + {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73bde715f940bea845a95247ea3e5eb17769ba1010efdc938ffcb967c634fa61"}, + {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3770e260b18e7f4e576edca4c0a639f704088602e0bc921c5c2e721e3acb8d15"}, + {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa0fd7248cf533c259e59dc593a60973a73e881162b1a2f73360547132742823"}, + {file = "watchfiles-0.24.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d7a2e3b7f5703ffbd500dabdefcbc9eafeff4b9444bbdd5d83d79eedf8428fab"}, + {file = "watchfiles-0.24.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:d831ee0a50946d24a53821819b2327d5751b0c938b12c0653ea5be7dea9c82ec"}, + {file = "watchfiles-0.24.0-cp311-none-win32.whl", hash = "sha256:49d617df841a63b4445790a254013aea2120357ccacbed00253f9c2b5dc24e2d"}, + {file = "watchfiles-0.24.0-cp311-none-win_amd64.whl", hash = "sha256:d3dcb774e3568477275cc76554b5a565024b8ba3a0322f77c246bc7111c5bb9c"}, + {file = "watchfiles-0.24.0-cp311-none-win_arm64.whl", hash = "sha256:9301c689051a4857d5b10777da23fafb8e8e921bcf3abe6448a058d27fb67633"}, + {file = "watchfiles-0.24.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:7211b463695d1e995ca3feb38b69227e46dbd03947172585ecb0588f19b0d87a"}, + {file = "watchfiles-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4b8693502d1967b00f2fb82fc1e744df128ba22f530e15b763c8d82baee15370"}, + {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdab9555053399318b953a1fe1f586e945bc8d635ce9d05e617fd9fe3a4687d6"}, + {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:34e19e56d68b0dad5cff62273107cf5d9fbaf9d75c46277aa5d803b3ef8a9e9b"}, + {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:41face41f036fee09eba33a5b53a73e9a43d5cb2c53dad8e61fa6c9f91b5a51e"}, + {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5148c2f1ea043db13ce9b0c28456e18ecc8f14f41325aa624314095b6aa2e9ea"}, + {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e4bd963a935aaf40b625c2499f3f4f6bbd0c3776f6d3bc7c853d04824ff1c9f"}, + {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c79d7719d027b7a42817c5d96461a99b6a49979c143839fc37aa5748c322f234"}, + {file = "watchfiles-0.24.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:32aa53a9a63b7f01ed32e316e354e81e9da0e6267435c7243bf8ae0f10b428ef"}, + {file = "watchfiles-0.24.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ce72dba6a20e39a0c628258b5c308779b8697f7676c254a845715e2a1039b968"}, + {file = "watchfiles-0.24.0-cp312-none-win32.whl", hash = "sha256:d9018153cf57fc302a2a34cb7564870b859ed9a732d16b41a9b5cb2ebed2d444"}, + {file = "watchfiles-0.24.0-cp312-none-win_amd64.whl", hash = "sha256:551ec3ee2a3ac9cbcf48a4ec76e42c2ef938a7e905a35b42a1267fa4b1645896"}, + {file = "watchfiles-0.24.0-cp312-none-win_arm64.whl", hash = "sha256:b52a65e4ea43c6d149c5f8ddb0bef8d4a1e779b77591a458a893eb416624a418"}, + {file = "watchfiles-0.24.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:3d2e3ab79a1771c530233cadfd277fcc762656d50836c77abb2e5e72b88e3a48"}, + {file = "watchfiles-0.24.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:327763da824817b38ad125dcd97595f942d720d32d879f6c4ddf843e3da3fe90"}, + {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd82010f8ab451dabe36054a1622870166a67cf3fce894f68895db6f74bbdc94"}, + {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d64ba08db72e5dfd5c33be1e1e687d5e4fcce09219e8aee893a4862034081d4e"}, + {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1cf1f6dd7825053f3d98f6d33f6464ebdd9ee95acd74ba2c34e183086900a827"}, + {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:43e3e37c15a8b6fe00c1bce2473cfa8eb3484bbeecf3aefbf259227e487a03df"}, + {file = 
"watchfiles-0.24.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88bcd4d0fe1d8ff43675360a72def210ebad3f3f72cabfeac08d825d2639b4ab"}, + {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:999928c6434372fde16c8f27143d3e97201160b48a614071261701615a2a156f"}, + {file = "watchfiles-0.24.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:30bbd525c3262fd9f4b1865cb8d88e21161366561cd7c9e1194819e0a33ea86b"}, + {file = "watchfiles-0.24.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:edf71b01dec9f766fb285b73930f95f730bb0943500ba0566ae234b5c1618c18"}, + {file = "watchfiles-0.24.0-cp313-none-win32.whl", hash = "sha256:f4c96283fca3ee09fb044f02156d9570d156698bc3734252175a38f0e8975f07"}, + {file = "watchfiles-0.24.0-cp313-none-win_amd64.whl", hash = "sha256:a974231b4fdd1bb7f62064a0565a6b107d27d21d9acb50c484d2cdba515b9366"}, + {file = "watchfiles-0.24.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:ee82c98bed9d97cd2f53bdb035e619309a098ea53ce525833e26b93f673bc318"}, + {file = "watchfiles-0.24.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fd92bbaa2ecdb7864b7600dcdb6f2f1db6e0346ed425fbd01085be04c63f0b05"}, + {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f83df90191d67af5a831da3a33dd7628b02a95450e168785586ed51e6d28943c"}, + {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fca9433a45f18b7c779d2bae7beeec4f740d28b788b117a48368d95a3233ed83"}, + {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b995bfa6bf01a9e09b884077a6d37070464b529d8682d7691c2d3b540d357a0c"}, + {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ed9aba6e01ff6f2e8285e5aa4154e2970068fe0fc0998c4380d0e6278222269b"}, + {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5171ef898299c657685306d8e1478a45e9303ddcd8ac5fed5bd52ad4ae0b69b"}, + {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4933a508d2f78099162da473841c652ad0de892719043d3f07cc83b33dfd9d91"}, + {file = "watchfiles-0.24.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:95cf3b95ea665ab03f5a54765fa41abf0529dbaf372c3b83d91ad2cfa695779b"}, + {file = "watchfiles-0.24.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:01def80eb62bd5db99a798d5e1f5f940ca0a05986dcfae21d833af7a46f7ee22"}, + {file = "watchfiles-0.24.0-cp38-none-win32.whl", hash = "sha256:4d28cea3c976499475f5b7a2fec6b3a36208656963c1a856d328aeae056fc5c1"}, + {file = "watchfiles-0.24.0-cp38-none-win_amd64.whl", hash = "sha256:21ab23fdc1208086d99ad3f69c231ba265628014d4aed31d4e8746bd59e88cd1"}, + {file = "watchfiles-0.24.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:b665caeeda58625c3946ad7308fbd88a086ee51ccb706307e5b1fa91556ac886"}, + {file = "watchfiles-0.24.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5c51749f3e4e269231510da426ce4a44beb98db2dce9097225c338f815b05d4f"}, + {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82b2509f08761f29a0fdad35f7e1638b8ab1adfa2666d41b794090361fb8b855"}, + {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a60e2bf9dc6afe7f743e7c9b149d1fdd6dbf35153c78fe3a14ae1a9aee3d98b"}, + {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:f7d9b87c4c55e3ea8881dfcbf6d61ea6775fffed1fedffaa60bd047d3c08c430"}, + {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:78470906a6be5199524641f538bd2c56bb809cd4bf29a566a75051610bc982c3"}, + {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07cdef0c84c03375f4e24642ef8d8178e533596b229d32d2bbd69e5128ede02a"}, + {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d337193bbf3e45171c8025e291530fb7548a93c45253897cd764a6a71c937ed9"}, + {file = "watchfiles-0.24.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ec39698c45b11d9694a1b635a70946a5bad066b593af863460a8e600f0dff1ca"}, + {file = "watchfiles-0.24.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2e28d91ef48eab0afb939fa446d8ebe77e2f7593f5f463fd2bb2b14132f95b6e"}, + {file = "watchfiles-0.24.0-cp39-none-win32.whl", hash = "sha256:7138eff8baa883aeaa074359daabb8b6c1e73ffe69d5accdc907d62e50b1c0da"}, + {file = "watchfiles-0.24.0-cp39-none-win_amd64.whl", hash = "sha256:b3ef2c69c655db63deb96b3c3e587084612f9b1fa983df5e0c3379d41307467f"}, + {file = "watchfiles-0.24.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:632676574429bee8c26be8af52af20e0c718cc7f5f67f3fb658c71928ccd4f7f"}, + {file = "watchfiles-0.24.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a2a9891723a735d3e2540651184be6fd5b96880c08ffe1a98bae5017e65b544b"}, + {file = "watchfiles-0.24.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7fa2bc0efef3e209a8199fd111b8969fe9db9c711acc46636686331eda7dd4"}, + {file = "watchfiles-0.24.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01550ccf1d0aed6ea375ef259706af76ad009ef5b0203a3a4cce0f6024f9b68a"}, + {file = "watchfiles-0.24.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:96619302d4374de5e2345b2b622dc481257a99431277662c30f606f3e22f42be"}, + {file = "watchfiles-0.24.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:85d5f0c7771dcc7a26c7a27145059b6bb0ce06e4e751ed76cdf123d7039b60b5"}, + {file = "watchfiles-0.24.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:951088d12d339690a92cef2ec5d3cfd957692834c72ffd570ea76a6790222777"}, + {file = "watchfiles-0.24.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49fb58bcaa343fedc6a9e91f90195b20ccb3135447dc9e4e2570c3a39565853e"}, + {file = "watchfiles-0.24.0.tar.gz", hash = "sha256:afb72325b74fa7a428c009c1b8be4b4d7c2afedafb2982827ef2156646df2fe1"}, +] + +[package.dependencies] +anyio = ">=3.0.0" + +[[package]] +name = "websockets" +version = "13.0.1" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "websockets-13.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1841c9082a3ba4a05ea824cf6d99570a6a2d8849ef0db16e9c826acb28089e8f"}, + {file = "websockets-13.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c5870b4a11b77e4caa3937142b650fbbc0914a3e07a0cf3131f35c0587489c1c"}, + {file = "websockets-13.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f1d3d1f2eb79fe7b0fb02e599b2bf76a7619c79300fc55f0b5e2d382881d4f7f"}, + {file = "websockets-13.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15c7d62ee071fa94a2fc52c2b472fed4af258d43f9030479d9c4a2de885fd543"}, + {file = 
"websockets-13.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6724b554b70d6195ba19650fef5759ef11346f946c07dbbe390e039bcaa7cc3d"}, + {file = "websockets-13.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56a952fa2ae57a42ba7951e6b2605e08a24801a4931b5644dfc68939e041bc7f"}, + {file = "websockets-13.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:17118647c0ea14796364299e942c330d72acc4b248e07e639d34b75067b3cdd8"}, + {file = "websockets-13.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:64a11aae1de4c178fa653b07d90f2fb1a2ed31919a5ea2361a38760192e1858b"}, + {file = "websockets-13.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0617fd0b1d14309c7eab6ba5deae8a7179959861846cbc5cb528a7531c249448"}, + {file = "websockets-13.0.1-cp310-cp310-win32.whl", hash = "sha256:11f9976ecbc530248cf162e359a92f37b7b282de88d1d194f2167b5e7ad80ce3"}, + {file = "websockets-13.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:c3c493d0e5141ec055a7d6809a28ac2b88d5b878bb22df8c621ebe79a61123d0"}, + {file = "websockets-13.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:699ba9dd6a926f82a277063603fc8d586b89f4cb128efc353b749b641fcddda7"}, + {file = "websockets-13.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cf2fae6d85e5dc384bf846f8243ddaa9197f3a1a70044f59399af001fd1f51d4"}, + {file = "websockets-13.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:52aed6ef21a0f1a2a5e310fb5c42d7555e9c5855476bbd7173c3aa3d8a0302f2"}, + {file = "websockets-13.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8eb2b9a318542153674c6e377eb8cb9ca0fc011c04475110d3477862f15d29f0"}, + {file = "websockets-13.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5df891c86fe68b2c38da55b7aea7095beca105933c697d719f3f45f4220a5e0e"}, + {file = "websockets-13.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fac2d146ff30d9dd2fcf917e5d147db037a5c573f0446c564f16f1f94cf87462"}, + {file = "websockets-13.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b8ac5b46fd798bbbf2ac6620e0437c36a202b08e1f827832c4bf050da081b501"}, + {file = "websockets-13.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:46af561eba6f9b0848b2c9d2427086cabadf14e0abdd9fde9d72d447df268418"}, + {file = "websockets-13.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b5a06d7f60bc2fc378a333978470dfc4e1415ee52f5f0fce4f7853eb10c1e9df"}, + {file = "websockets-13.0.1-cp311-cp311-win32.whl", hash = "sha256:556e70e4f69be1082e6ef26dcb70efcd08d1850f5d6c5f4f2bcb4e397e68f01f"}, + {file = "websockets-13.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:67494e95d6565bf395476e9d040037ff69c8b3fa356a886b21d8422ad86ae075"}, + {file = "websockets-13.0.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f9c9e258e3d5efe199ec23903f5da0eeaad58cf6fccb3547b74fd4750e5ac47a"}, + {file = "websockets-13.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6b41a1b3b561f1cba8321fb32987552a024a8f67f0d05f06fcf29f0090a1b956"}, + {file = "websockets-13.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f73e676a46b0fe9426612ce8caeca54c9073191a77c3e9d5c94697aef99296af"}, + {file = "websockets-13.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f613289f4a94142f914aafad6c6c87903de78eae1e140fa769a7385fb232fdf"}, + {file = 
"websockets-13.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f52504023b1480d458adf496dc1c9e9811df4ba4752f0bc1f89ae92f4f07d0c"}, + {file = "websockets-13.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:139add0f98206cb74109faf3611b7783ceafc928529c62b389917a037d4cfdf4"}, + {file = "websockets-13.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:47236c13be337ef36546004ce8c5580f4b1150d9538b27bf8a5ad8edf23ccfab"}, + {file = "websockets-13.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c44ca9ade59b2e376612df34e837013e2b273e6c92d7ed6636d0556b6f4db93d"}, + {file = "websockets-13.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9bbc525f4be3e51b89b2a700f5746c2a6907d2e2ef4513a8daafc98198b92237"}, + {file = "websockets-13.0.1-cp312-cp312-win32.whl", hash = "sha256:3624fd8664f2577cf8de996db3250662e259bfbc870dd8ebdcf5d7c6ac0b5185"}, + {file = "websockets-13.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0513c727fb8adffa6d9bf4a4463b2bade0186cbd8c3604ae5540fae18a90cb99"}, + {file = "websockets-13.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:1ee4cc030a4bdab482a37462dbf3ffb7e09334d01dd37d1063be1136a0d825fa"}, + {file = "websockets-13.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dbb0b697cc0655719522406c059eae233abaa3243821cfdfab1215d02ac10231"}, + {file = "websockets-13.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:acbebec8cb3d4df6e2488fbf34702cbc37fc39ac7abf9449392cefb3305562e9"}, + {file = "websockets-13.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63848cdb6fcc0bf09d4a155464c46c64ffdb5807ede4fb251da2c2692559ce75"}, + {file = "websockets-13.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:872afa52a9f4c414d6955c365b6588bc4401272c629ff8321a55f44e3f62b553"}, + {file = "websockets-13.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05e70fec7c54aad4d71eae8e8cab50525e899791fc389ec6f77b95312e4e9920"}, + {file = "websockets-13.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e82db3756ccb66266504f5a3de05ac6b32f287faacff72462612120074103329"}, + {file = "websockets-13.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4e85f46ce287f5c52438bb3703d86162263afccf034a5ef13dbe4318e98d86e7"}, + {file = "websockets-13.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f3fea72e4e6edb983908f0db373ae0732b275628901d909c382aae3b592589f2"}, + {file = "websockets-13.0.1-cp313-cp313-win32.whl", hash = "sha256:254ecf35572fca01a9f789a1d0f543898e222f7b69ecd7d5381d8d8047627bdb"}, + {file = "websockets-13.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:ca48914cdd9f2ccd94deab5bcb5ac98025a5ddce98881e5cce762854a5de330b"}, + {file = "websockets-13.0.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b74593e9acf18ea5469c3edaa6b27fa7ecf97b30e9dabd5a94c4c940637ab96e"}, + {file = "websockets-13.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:132511bfd42e77d152c919147078460c88a795af16b50e42a0bd14f0ad71ddd2"}, + {file = "websockets-13.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:165bedf13556f985a2aa064309baa01462aa79bf6112fbd068ae38993a0e1f1b"}, + {file = "websockets-13.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e801ca2f448850685417d723ec70298feff3ce4ff687c6f20922c7474b4746ae"}, + {file = 
"websockets-13.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30d3a1f041360f029765d8704eae606781e673e8918e6b2c792e0775de51352f"}, + {file = "websockets-13.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67648f5e50231b5a7f6d83b32f9c525e319f0ddc841be0de64f24928cd75a603"}, + {file = "websockets-13.0.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:4f0426d51c8f0926a4879390f53c7f5a855e42d68df95fff6032c82c888b5f36"}, + {file = "websockets-13.0.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ef48e4137e8799998a343706531e656fdec6797b80efd029117edacb74b0a10a"}, + {file = "websockets-13.0.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:249aab278810bee585cd0d4de2f08cfd67eed4fc75bde623be163798ed4db2eb"}, + {file = "websockets-13.0.1-cp38-cp38-win32.whl", hash = "sha256:06c0a667e466fcb56a0886d924b5f29a7f0886199102f0a0e1c60a02a3751cb4"}, + {file = "websockets-13.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1f3cf6d6ec1142412d4535adabc6bd72a63f5f148c43fe559f06298bc21953c9"}, + {file = "websockets-13.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1fa082ea38d5de51dd409434edc27c0dcbd5fed2b09b9be982deb6f0508d25bc"}, + {file = "websockets-13.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4a365bcb7be554e6e1f9f3ed64016e67e2fa03d7b027a33e436aecf194febb63"}, + {file = "websockets-13.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:10a0dc7242215d794fb1918f69c6bb235f1f627aaf19e77f05336d147fce7c37"}, + {file = "websockets-13.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59197afd478545b1f73367620407b0083303569c5f2d043afe5363676f2697c9"}, + {file = "websockets-13.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d20516990d8ad557b5abeb48127b8b779b0b7e6771a265fa3e91767596d7d97"}, + {file = "websockets-13.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1a2e272d067030048e1fe41aa1ec8cfbbaabce733b3d634304fa2b19e5c897f"}, + {file = "websockets-13.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ad327ac80ba7ee61da85383ca8822ff808ab5ada0e4a030d66703cc025b021c4"}, + {file = "websockets-13.0.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:518f90e6dd089d34eaade01101fd8a990921c3ba18ebbe9b0165b46ebff947f0"}, + {file = "websockets-13.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:68264802399aed6fe9652e89761031acc734fc4c653137a5911c2bfa995d6d6d"}, + {file = "websockets-13.0.1-cp39-cp39-win32.whl", hash = "sha256:a5dc0c42ded1557cc7c3f0240b24129aefbad88af4f09346164349391dea8e58"}, + {file = "websockets-13.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:b448a0690ef43db5ef31b3a0d9aea79043882b4632cfc3eaab20105edecf6097"}, + {file = "websockets-13.0.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:faef9ec6354fe4f9a2c0bbb52fb1ff852effc897e2a4501e25eb3a47cb0a4f89"}, + {file = "websockets-13.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:03d3f9ba172e0a53e37fa4e636b86cc60c3ab2cfee4935e66ed1d7acaa4625ad"}, + {file = "websockets-13.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d450f5a7a35662a9b91a64aefa852f0c0308ee256122f5218a42f1d13577d71e"}, + {file = "websockets-13.0.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:3f55b36d17ac50aa8a171b771e15fbe1561217510c8768af3d546f56c7576cdc"}, + {file = "websockets-13.0.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14b9c006cac63772b31abbcd3e3abb6228233eec966bf062e89e7fa7ae0b7333"}, + {file = "websockets-13.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b79915a1179a91f6c5f04ece1e592e2e8a6bd245a0e45d12fd56b2b59e559a32"}, + {file = "websockets-13.0.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f40de079779acbcdbb6ed4c65af9f018f8b77c5ec4e17a4b737c05c2db554491"}, + {file = "websockets-13.0.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:80e4ba642fc87fa532bac07e5ed7e19d56940b6af6a8c61d4429be48718a380f"}, + {file = "websockets-13.0.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a02b0161c43cc9e0232711eff846569fad6ec836a7acab16b3cf97b2344c060"}, + {file = "websockets-13.0.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6aa74a45d4cdc028561a7d6ab3272c8b3018e23723100b12e58be9dfa5a24491"}, + {file = "websockets-13.0.1-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00fd961943b6c10ee6f0b1130753e50ac5dcd906130dcd77b0003c3ab797d026"}, + {file = "websockets-13.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:d93572720d781331fb10d3da9ca1067817d84ad1e7c31466e9f5e59965618096"}, + {file = "websockets-13.0.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:71e6e5a3a3728886caee9ab8752e8113670936a193284be9d6ad2176a137f376"}, + {file = "websockets-13.0.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:c4a6343e3b0714e80da0b0893543bf9a5b5fa71b846ae640e56e9abc6fbc4c83"}, + {file = "websockets-13.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a678532018e435396e37422a95e3ab87f75028ac79570ad11f5bf23cd2a7d8c"}, + {file = "websockets-13.0.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6716c087e4aa0b9260c4e579bb82e068f84faddb9bfba9906cb87726fa2e870"}, + {file = "websockets-13.0.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e33505534f3f673270dd67f81e73550b11de5b538c56fe04435d63c02c3f26b5"}, + {file = "websockets-13.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:acab3539a027a85d568c2573291e864333ec9d912675107d6efceb7e2be5d980"}, + {file = "websockets-13.0.1-py3-none-any.whl", hash = "sha256:b80f0c51681c517604152eb6a572f5a9378f877763231fddb883ba2f968e8817"}, + {file = "websockets-13.0.1.tar.gz", hash = "sha256:4d6ece65099411cfd9a48d13701d7438d9c34f479046b34c50ff60bb8834e43e"}, +] + [[package]] name = "wrapt" version = "1.16.0" @@ -995,101 +1611,103 @@ files = [ [[package]] name = "yarl" -version = "1.9.4" +version = "1.9.7" description = "Yet another URL library" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"}, - {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"}, - {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"}, - {file = 
"yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"}, - {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"}, - {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"}, - {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"}, - {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"}, - {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"}, - {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"}, - {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"}, - {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"}, - {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"}, - {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"}, - {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"}, - {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"}, - {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"}, - {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"}, - {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"}, - {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"}, - {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"}, - {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"}, - {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"}, - {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"}, - {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"}, - {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = 
"sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"}, - {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"}, - {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"}, - {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"}, - {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"}, - {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"}, - {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"}, - {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"}, - {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"}, - {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"}, - {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"}, - {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"}, - {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"}, - {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"}, - {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"}, - {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"}, - {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"}, - {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"}, - {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"}, - {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"}, - {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"}, - {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"}, - {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"}, - {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"}, - {file = 
"yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"}, - {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"}, - {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"}, - {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"}, - {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"}, - {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"}, - {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"}, - {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"}, - {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"}, - {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"}, - {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"}, - {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"}, - {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"}, - {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"}, - {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"}, - {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"}, - {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"}, - {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"}, - {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"}, - {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"}, - {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"}, - {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"}, - {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"}, - {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = 
"sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"}, - {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"}, - {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"}, - {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"}, - {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"}, - {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"}, - {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"}, - {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"}, - {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"}, - {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"}, - {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"}, - {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"}, - {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"}, - {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"}, - {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"}, - {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"}, - {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"}, - {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"}, + {file = "yarl-1.9.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:60c04415b31a1611ef5989a6084dd6f6b95652c6a18378b58985667b65b2ecb6"}, + {file = "yarl-1.9.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1787dcfdbe730207acb454548a6e19f80ae75e6d2d1f531c5a777bc1ab6f7952"}, + {file = "yarl-1.9.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f5ddad20363f9f1bbedc95789c897da62f939e6bc855793c3060ef8b9f9407bf"}, + {file = "yarl-1.9.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdb156a06208fc9645ae7cc0fca45c40dd40d7a8c4db626e542525489ca81a9"}, + {file = "yarl-1.9.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:522fa3d300d898402ae4e0fa7c2c21311248ca43827dc362a667de87fdb4f1be"}, + {file = "yarl-1.9.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e7f9cabfb8b980791b97a3ae3eab2e38b2ba5eab1af9b7495bdc44e1ce7c89e3"}, + {file = 
"yarl-1.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fc728857df4087da6544fc68f62d7017fa68d74201d5b878e18ed4822c31fb3"}, + {file = "yarl-1.9.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dba2ebac677184d56374fa3e452b461f5d6a03aa132745e648ae8859361eb6b"}, + {file = "yarl-1.9.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a95167ae34667c5cc7d9206c024f793e8ffbadfb307d5c059de470345de58a21"}, + {file = "yarl-1.9.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9d319ac113ca47352319cbea92d1925a37cb7bd61a8c2f3e3cd2e96eb33cccae"}, + {file = "yarl-1.9.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:2d71a5d818d82586ac46265ae01466e0bda0638760f18b21f1174e0dd58a9d2f"}, + {file = "yarl-1.9.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ff03f1c1ac474c66d474929ae7e4dd195592c1c7cc8c36418528ed81b1ca0a79"}, + {file = "yarl-1.9.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:78250f635f221dde97d02c57aade3313310469bc291888dfe32acd1012594441"}, + {file = "yarl-1.9.7-cp310-cp310-win32.whl", hash = "sha256:f3aaf9fa960d55bd7876d55d7ea3cc046f3660df1ff73fc1b8c520a741ed1f21"}, + {file = "yarl-1.9.7-cp310-cp310-win_amd64.whl", hash = "sha256:e8362c941e07fbcde851597672a5e41b21dc292b7d5a1dc439b7a93c9a1af5d9"}, + {file = "yarl-1.9.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:596069ddeaf72b5eb36cd714dcd2b5751d0090d05a8d65113b582ed9e1c801fb"}, + {file = "yarl-1.9.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cb870907e8b86b2f32541403da9455afc1e535ce483e579bea0e6e79a0cc751c"}, + {file = "yarl-1.9.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ca5e86be84492fa403c4dcd4dcaf8e1b1c4ffc747b5176f7c3d09878c45719b0"}, + {file = "yarl-1.9.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a99cecfb51c84d00132db909e83ae388793ca86e48df7ae57f1be0beab0dcce5"}, + {file = "yarl-1.9.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:25508739e9b44d251172145f54c084b71747b09e4d237dc2abb045f46c36a66e"}, + {file = "yarl-1.9.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:60f3b5aec3146b6992640592856414870f5b20eb688c1f1d5f7ac010a7f86561"}, + {file = "yarl-1.9.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1557456afce5db3d655b5f8a31cdcaae1f47e57958760525c44b76e812b4987"}, + {file = "yarl-1.9.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:71bb1435a84688ed831220c5305d96161beb65cac4a966374475348aa3de4575"}, + {file = "yarl-1.9.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f87d8645a7a806ec8f66aac5e3b1dcb5014849ff53ffe2a1f0b86ca813f534c7"}, + {file = "yarl-1.9.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:58e3f01673873b8573da3abe138debc63e4e68541b2104a55df4c10c129513a4"}, + {file = "yarl-1.9.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8af0bbd4d84f8abdd9b11be9488e32c76b1501889b73c9e2292a15fb925b378b"}, + {file = "yarl-1.9.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:7fc441408ed0d9c6d2d627a02e281c21f5de43eb5209c16636a17fc704f7d0f8"}, + {file = "yarl-1.9.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a9552367dc440870556da47bb289a806f08ad06fbc4054072d193d9e5dd619ba"}, + {file = "yarl-1.9.7-cp311-cp311-win32.whl", hash = "sha256:628619008680a11d07243391271b46f07f13b75deb9fe92ef342305058c70722"}, + {file = "yarl-1.9.7-cp311-cp311-win_amd64.whl", hash = 
"sha256:bc23d870864971c8455cfba17498ccefa53a5719ea9f5fce5e7e9c1606b5755f"}, + {file = "yarl-1.9.7-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d8cf3d0b67996edc11957aece3fbce4c224d0451c7c3d6154ec3a35d0e55f6b"}, + {file = "yarl-1.9.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3a7748cd66fef49c877e59503e0cc76179caf1158d1080228e67e1db14554f08"}, + {file = "yarl-1.9.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a6fa3aeca8efabb0fbbb3b15e0956b0cb77f7d9db67c107503c30af07cd9e00"}, + {file = "yarl-1.9.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf37dd0008e5ac5c3880198976063c491b6a15b288d150d12833248cf2003acb"}, + {file = "yarl-1.9.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87aa5308482f248f8c3bd9311cd6c7dfd98ea1a8e57e35fb11e4adcac3066003"}, + {file = "yarl-1.9.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:867b13c1b361f9ba5d2f84dc5408082f5d744c83f66de45edc2b96793a9c5e48"}, + {file = "yarl-1.9.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48ce93947554c2c85fe97fc4866646ec90840bc1162e4db349b37d692a811755"}, + {file = "yarl-1.9.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fcd3d94b848cba132f39a5b40d80b0847d001a91a6f35a2204505cdd46afe1b2"}, + {file = "yarl-1.9.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d06d6a8f98dd87646d98f0c468be14b201e47ec6092ad569adf835810ad0dffb"}, + {file = "yarl-1.9.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:91567ff4fce73d2e7ac67ed5983ad26ba2343bc28cb22e1e1184a9677df98d7c"}, + {file = "yarl-1.9.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1d5594512541e63188fea640b7f066c218d2176203d6e6f82abf702ae3dca3b2"}, + {file = "yarl-1.9.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9c2743e43183e4afbb07d5605693299b8756baff0b086c25236c761feb0e3c56"}, + {file = "yarl-1.9.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:daa69a3a2204355af39f4cfe7f3870d87c53d77a597b5100b97e3faa9460428b"}, + {file = "yarl-1.9.7-cp312-cp312-win32.whl", hash = "sha256:36b16884336c15adf79a4bf1d592e0c1ffdb036a760e36a1361565b66785ec6c"}, + {file = "yarl-1.9.7-cp312-cp312-win_amd64.whl", hash = "sha256:2ead2f87a1174963cc406d18ac93d731fbb190633d3995fa052d10cefae69ed8"}, + {file = "yarl-1.9.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:808eddabcb6f7b2cdb6929b3e021ac824a2c07dc7bc83f7618e18438b1b65781"}, + {file = "yarl-1.9.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:395ab0d8ce6d104a988da429bcbfd445e03fb4c911148dfd523f69d13f772e47"}, + {file = "yarl-1.9.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:49827dfccbd59c4499605c13805e947349295466e490860a855b7c7e82ec9c75"}, + {file = "yarl-1.9.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6b8bbdd425d0978311520ea99fb6c0e9e04e64aee84fac05f3157ace9f81b05"}, + {file = "yarl-1.9.7-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71d33fd1c219b5b28ee98cd76da0c9398a4ed4792fd75c94135237db05ba5ca8"}, + {file = "yarl-1.9.7-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:62440431741d0b7d410e5cbad800885e3289048140a43390ecab4f0b96dde3bb"}, + {file = "yarl-1.9.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4db97210433366dfba55590e48285b89ad0146c52bf248dd0da492dd9f0f72cf"}, + {file = 
"yarl-1.9.7-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:653597b615809f2e5f4dba6cd805608b6fd3597128361a22cc612cf7c7a4d1bf"}, + {file = "yarl-1.9.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:df47612129e66f7ce7c9994d4cd4e6852f6e3bf97699375d86991481796eeec8"}, + {file = "yarl-1.9.7-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5e338b6febbae6c9fe86924bac3ea9c1944e33255c249543cd82a4af6df6047b"}, + {file = "yarl-1.9.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e649d37d04665dddb90994bbf0034331b6c14144cc6f3fbce400dc5f28dc05b7"}, + {file = "yarl-1.9.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:0a1b8fd849567be56342e988e72c9d28bd3c77b9296c38b9b42d2fe4813c9d3f"}, + {file = "yarl-1.9.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f9d715b2175dff9a49c6dafdc2ab3f04850ba2f3d4a77f69a5a1786b057a9d45"}, + {file = "yarl-1.9.7-cp313-cp313-win32.whl", hash = "sha256:bc9233638b07c2e4a3a14bef70f53983389bffa9e8cb90a2da3f67ac9c5e1842"}, + {file = "yarl-1.9.7-cp313-cp313-win_amd64.whl", hash = "sha256:62e110772330d7116f91e79cd83fef92545cb2f36414c95881477aa01971f75f"}, + {file = "yarl-1.9.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a564155cc2194ecd9c0d8f8dc57059b822a507de5f08120063675eb9540576aa"}, + {file = "yarl-1.9.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:03e917cc44a01e1be60a83ee1a17550b929490aaa5df2a109adc02137bddf06b"}, + {file = "yarl-1.9.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eefda67ba0ba44ab781e34843c266a76f718772b348f7c5d798d8ea55b95517f"}, + {file = "yarl-1.9.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:316c82b499b6df41444db5dea26ee23ece9356e38cea43a8b2af9e6d8a3558e4"}, + {file = "yarl-1.9.7-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10452727843bc847596b75e30a7fe92d91829f60747301d1bd60363366776b0b"}, + {file = "yarl-1.9.7-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:050f3e4d886be55728fef268587d061c5ce6f79a82baba71840801b63441c301"}, + {file = "yarl-1.9.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0aabe557446aa615693a82b4d3803c102fd0e7a6a503bf93d744d182a510184"}, + {file = "yarl-1.9.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23404842228e6fa8ace235024519df37f3f8e173620407644d40ddca571ff0f4"}, + {file = "yarl-1.9.7-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:34736fcc9d6d7080ebbeb0998ecb91e4f14ad8f18648cf0b3099e2420a225d86"}, + {file = "yarl-1.9.7-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:48f7a158f3ca67509d21cb02a96964e4798b6f133691cc0c86cf36e26e26ec8f"}, + {file = "yarl-1.9.7-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:6639444d161c693cdabb073baaed1945c717d3982ecedf23a219bc55a242e728"}, + {file = "yarl-1.9.7-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:1cd450e10cb53d63962757c3f6f7870be49a3e448c46621d6bd46f8088d532de"}, + {file = "yarl-1.9.7-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74d3ef5e81f81507cea04bf5ae22f18ef538607a7c754aac2b6e3029956a2842"}, + {file = "yarl-1.9.7-cp38-cp38-win32.whl", hash = "sha256:4052dbd0c900bece330e3071c636f99dff06e4628461a29b38c6e222a427cf98"}, + {file = "yarl-1.9.7-cp38-cp38-win_amd64.whl", hash = "sha256:dd08da4f2d171e19bd02083c921f1bef89f8f5f87000d0ffc49aa257bc5a9802"}, + {file = "yarl-1.9.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ab906a956d2109c6ea11e24c66592b06336e2743509290117f0f7f47d2c1dd3"}, 
+ {file = "yarl-1.9.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d8ad761493d5aaa7ab2a09736e62b8a220cb0b10ff8ccf6968c861cd8718b915"}, + {file = "yarl-1.9.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d35f9cdab0ec5e20cf6d2bd46456cf599052cf49a1698ef06b9592238d1cf1b1"}, + {file = "yarl-1.9.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a48d2b9f0ae29a456fb766ae461691378ecc6cf159dd9f938507d925607591c3"}, + {file = "yarl-1.9.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cf85599c9336b89b92c313519bcaa223d92fa5d98feb4935a47cce2e8722b4b8"}, + {file = "yarl-1.9.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8e8916b1ff7680b1f2b1608c82dc15c569b9f2cb2da100c747c291f1acf18a14"}, + {file = "yarl-1.9.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29c80890e0a64fb0e5f71350d48da330995073881f8b8e623154aef631febfb0"}, + {file = "yarl-1.9.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9163d21aa40ff8528db2aee2b0b6752efe098055b41ab8e5422b2098457199fe"}, + {file = "yarl-1.9.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:65e3098969baf221bb45e3b2f60735fc2b154fc95902131ebc604bae4c629ea6"}, + {file = "yarl-1.9.7-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:cddebd096effe4be90fd378e4224cd575ac99e1c521598a6900e94959006e02e"}, + {file = "yarl-1.9.7-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:8525f955a2dcc281573b6aadeb8ab9c37e2d3428b64ca6a2feec2a794a69c1da"}, + {file = "yarl-1.9.7-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:5d585c7d834c13f24c7e3e0efaf1a4b7678866940802e11bd6c4d1f99c935e6b"}, + {file = "yarl-1.9.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:78805148e780a9ca66f3123e04741e344b66cf06b4fb13223e3a209f39a6da55"}, + {file = "yarl-1.9.7-cp39-cp39-win32.whl", hash = "sha256:3f53df493ec80b76969d6e1ae6e4411a55ab1360e02b80c84bd4b33d61a567ba"}, + {file = "yarl-1.9.7-cp39-cp39-win_amd64.whl", hash = "sha256:c81c28221a85add23a0922a6aeb2cdda7f9723e03e2dfae06fee5c57fe684262"}, + {file = "yarl-1.9.7-py3-none-any.whl", hash = "sha256:49935cc51d272264358962d050d726c3e5603a616f53e52ea88e9df1728aa2ee"}, + {file = "yarl-1.9.7.tar.gz", hash = "sha256:f28e602edeeec01fc96daf7728e8052bc2e12a672e2a138561a1ebaf30fd9df7"}, ] [package.dependencies] @@ -1099,4 +1717,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "3a26c9a4fba04babfcead3a954f3e6aa383db8b6f4f2e3134717340051d536a3" +content-hash = "5549a75f40830d3888d34ab5c12a70141482537d27f7c672c941088aac47d079" diff --git a/pyproject.toml b/pyproject.toml index 5d94298..abb18ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,16 @@ optional = true pytest-memray = "^1.7.0" pyinstrument = "^4.7.2" +[tool.poetry.group.docs] +optional = true +[tool.poetry.group.docs.dependencies] +sphinx = { version = "^8.0.2", python = "^3.10" } +sphinx-autobuild = { version = "^2024.04.16", python = "^3.10"} +sphinx-design = {version = "^0.6.1", python = "^3.10"} +furo = {version = "^2024.8.6", python = "^3.10"} +sphinxext-opengraph = {version = "^0.9.1", python = "^3.10"} +sphinx-copybutton = {version = "^0.5.2", python = "^3.10"} + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" From 5bffe5a11dfbd4dbee7cbf56f5220e73c2fed175 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 3 Sep 2024 13:50:54 +0100 Subject: [PATCH 47/79] Add preliminary github actions --- .github/workflows/lint.yml 
| 30 ++++++ .github/workflows/publish.yml | 89 ++++++++++++++++ .github/workflows/test.yml | 32 ++++++ .github/workflows/test_integration.yml | 45 ++++++++ Makefile | 12 +++ pyproject.toml | 2 + tests/integration/.gitignore | 4 + tests/integration/cassettes/.gitignore | 2 - tests/integration/conftest.py | 27 +++++ tests/integration/download.py | 138 +++++++++++++++++++++++++ tests/integration/test_integration.py | 30 +----- tests/integration/test_mem.py | 16 +++ 12 files changed, 396 insertions(+), 31 deletions(-) create mode 100644 .github/workflows/lint.yml create mode 100644 .github/workflows/publish.yml create mode 100644 .github/workflows/test.yml create mode 100644 .github/workflows/test_integration.yml create mode 100644 tests/integration/.gitignore delete mode 100644 tests/integration/cassettes/.gitignore create mode 100644 tests/integration/download.py create mode 100644 tests/integration/test_mem.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..d967cd5 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,30 @@ +name: Test + +on: [push, pull_request] + +permissions: + contents: read + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Install Poetry + run: pipx install poetry==1.8.3 + - name: Setup Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: "3.8" + cache: "poetry" + - name: Install dependencies + run: poetry install --no-interaction --no-root + - name: Install Project + run: poetry install --no-interaction + - name: Poetry Build + run: poetry run ruff format --check + id: format + - name: Ruff Lint Check + run: poetry run ruff check --output-format=github + if: success() || steps.format.conclusion == 'failure' \ No newline at end of file diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..35eb448 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,89 @@ +name: Push to PyPI + +on: + push: + tags: + - '*' + workflow_dispatch: + +jobs: + build: + name: Build Distribution + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Install Poetry + run: pipx install poetry==1.8.3 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: "3.8" + cache: "poetry" + - name: Install Python dependencies + run: poetry install --no-interaction --no-root + - name: Build + run: poetry build + - name: Store distribution packages + uses: actions/upload-artifact@v3 + with: + name: python-package-distributions + path: dist/ + publish-to-pypi: + name: Publish to PyPI + needs: + - build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/ultimate-sitemap-parser + permissions: + id-token: write + steps: + - name: Download distribution packages + uses: actions/download-artifact@v3 + with: + name: python-package-distributions + path: dist/ + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + github-release: + name: GitHub release + needs: + - publish-to-pypi + runs-on: ubuntu-latest + + permissions: + contents: write + id-token: write + + steps: + - name: Download distribution packages + uses: actions/download-artifact@v3 + with: + name: python-package-distributions + path: dist/ + - name: Sign the dists with Sigstore + uses: sigstore/gh-action-sigstore-python@v2.1.1 + with: + inputs: >- + ./dist/*.tar.gz + ./dist/*.whl + - name: Create GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + run: >- + gh release 
create + '${{ github.ref_name }}' + --repo '${{ github.repository }}' + --notes "" + - name: Upload artifact signatures to GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + # Upload to GitHub Release using the `gh` CLI. + # `dist/` contains the built packages, and the + # sigstore-produced signatures and certificates. + run: >- + gh release upload + '${{ github.ref_name }}' dist/** + --repo '${{ github.repository }}' \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..e03be65 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,32 @@ +name: Test + +on: [push, pull_request] + +permissions: + contents: read + +jobs: + test: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + - name: Install Poetry + run: pipx install poetry==1.8.3 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "poetry" + - name: Install dependencies + run: poetry install --no-interaction --no-root + - name: Install Project + run: poetry install --no-interaction + - name: Poetry Build + run: poetry build + - name: Run tests + run: poetry run pytest \ No newline at end of file diff --git a/.github/workflows/test_integration.yml b/.github/workflows/test_integration.yml new file mode 100644 index 0000000..8795011 --- /dev/null +++ b/.github/workflows/test_integration.yml @@ -0,0 +1,45 @@ +name: Test + +on: [workflow_dispatch] + +permissions: + contents: read + +jobs: + test: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: ["3.8"] + + steps: + - uses: actions/checkout@v4 + - name: Install Poetry + run: pipx install poetry==1.8.3 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "poetry" + - name: Install dependencies + run: poetry install --no-interaction --no-root + - name: Install Project + run: poetry install --no-interaction + - name: Cache cassettes + uses: actions/cache@v4 + with: + path: tests/integration/cassettes + # Always restore this cache as the script takes care of updating + key: usp-cassettes + - name: Download cassettes + run: poetry run python tests/integration/download.py -d + - name: Run integration tests + run: poetry run pytest --integration --durations=0 \ + --junit-xml=$GITHUB_SHA.xml \ + tests/integration/test_integration.py + - name: Upload report + uses: actions/upload-artifact@v4 + with: + path: $GITHUB_SHA.xml + name: junit_report \ No newline at end of file diff --git a/Makefile b/Makefile index 65444ba..7682b1a 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,18 @@ test: poetry run pytest +.PHONY: integ +integ: + poetry run pytest --integration tests/integration --durations 0 + +.PHONY: mem +mem: + poetry run pytest --memray --memray-bin-path memray --integration tests/integration + +.PHONY: prof +prof: + poetry run pyinstrument -m pytest --integration tests/integration + .PHONY: lint lint: poetry run ruff check --fix diff --git a/pyproject.toml b/pyproject.toml index abb18ca..9a7a339 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,5 +72,7 @@ select = [ ] [tool.pytest.ini_options] +junit_suite_name = "ultimate-sitemap-parser" +junit_duration_report = "call" log_cli = true log_cli_level = "WARNING" \ No newline at end of file diff --git a/tests/integration/.gitignore 
b/tests/integration/.gitignore new file mode 100644 index 0000000..91ec9a1 --- /dev/null +++ b/tests/integration/.gitignore @@ -0,0 +1,4 @@ +cassettes/*.yaml +cassettes/manifest.json +cassettes/hashes.json +cassettes/download/ \ No newline at end of file diff --git a/tests/integration/cassettes/.gitignore b/tests/integration/cassettes/.gitignore deleted file mode 100644 index 5e7a51a..0000000 --- a/tests/integration/cassettes/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.yaml -manifest.json \ No newline at end of file diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c5d5790..395abf6 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,4 +1,8 @@ +import json +from pathlib import Path + import pytest +import vcr def pytest_addoption(parser): @@ -22,3 +26,26 @@ def pytest_collection_modifyitems(config, items): for item in items: if "integration" in item.keywords: item.add_marker(skip_perf) + +def pytest_generate_tests(metafunc): + # cassettes = list(Path(__file__).parent.joinpath('cassettes').glob('*.yaml')) + # cassette_names = [f"integration-{cassette.stem}" for cassette in cassettes] + # metafunc.parametrize('cassette_path', cassettes, ids=cassette_names, indirect=True) + cassettes_root = Path(__file__).parent / "cassettes" + + manifest_path = cassettes_root / "manifest.json" + if not manifest_path.exists(): + return + + manifest = json.loads(manifest_path.read_text()) + cassette_fixtures = [ + (url, cassettes_root / item["name"]) for url, item in manifest.items() + ] + cassette_ids = [f"integration-{url}" for url, _ in cassette_fixtures] + metafunc.parametrize("site_url,cassette_path", cassette_fixtures, ids=cassette_ids) + + +@pytest.fixture +def _with_vcr(cassette_path): + with vcr.use_cassette(cassette_path, record_mode="none"): + yield diff --git a/tests/integration/download.py b/tests/integration/download.py new file mode 100644 index 0000000..4ffea25 --- /dev/null +++ b/tests/integration/download.py @@ -0,0 +1,138 @@ +import argparse +import gzip +import hashlib +import json +import logging +from pathlib import Path +import shutil +import sys + +import requests + +CASSETTE_REPO = "https://github.com/GateNLP/usp-test-cassettes" +MANIFEST_FILE = f"{CASSETTE_REPO}/raw/main/manifest.json" +CASSETTE_ROOT = Path(__file__).parent / "cassettes" + +def download_manifest(): + r = requests.get(MANIFEST_FILE, allow_redirects=True) + r.raise_for_status() + + data = json.loads(r.text) + + with open(CASSETTE_ROOT / "manifest.json", "w") as f: + f.write(r.text) + + return data + +def load_hashes(): + if not (CASSETTE_ROOT / "hashes.json").exists(): + return {} + + with open(CASSETTE_ROOT / "hashes.json") as f: + return json.load(f) + +def find_new(manifest, current_hashes): + to_dl = [] + + for url, data in manifest.items(): + if current_hashes.get(url, {}) != data['hash']: + logging.info(f"{url} is out-of-date") + to_dl.append(url) + + return to_dl + +def calc_hash(path): + with open(path, "rb") as f: + return hashlib.sha256(f.read()).hexdigest() + +def dl_cassette(data): + dl_gz_path = CASSETTE_ROOT / "download" / f"{data['name']}.gz" + logging.info(f"Downloading {data['url']} to {dl_gz_path}") + with requests.get(data["url"], allow_redirects=True, stream=True) as r: + r.raise_for_status() + + with open(dl_gz_path / dl_gz_path, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + + cassette_path = CASSETTE_ROOT / data["name"] + dl_hash = calc_hash(dl_gz_path) + + if dl_hash != data["hash"]: + 
logging.error(f"Downloaded file hash {dl_hash} does not match expected hash {data['hash']}") + exit(1) + + logging.info(f"Download completed, extracting to {cassette_path}") + + with gzip.open(dl_gz_path, 'rb') as f_gz: + with open(cassette_path, 'wb') as f_cassette: + shutil.copyfileobj(f_gz, f_cassette) + + return dl_gz_path, cassette_path + +def update_hashes(current_hashes, url, new_hashes): + current_hashes[url] = new_hashes + + with open(CASSETTE_ROOT / "hashes.json", "w") as f: + json.dump(current_hashes, f, indent=2) + +def cleanup_files(data, confirm=True): + cassettes = CASSETTE_ROOT.glob("*.yaml") + downloads = (CASSETTE_ROOT / "download").glob("*.yaml.gz") + + files = set(list(cassettes) + list(downloads)) + + keep_files = [] + for cassette in data.values(): + keep_files.append(CASSETTE_ROOT / cassette["name"]) + keep_files.append(CASSETTE_ROOT / "download" / f"{cassette['name']}.gz") + keep_files = set(keep_files) + + to_delete = files - keep_files + + if len(to_delete) == 0: + return + + if confirm: + sys.stdout.write(f"{len(to_delete)} files to be deleted:\n") + for file in to_delete: + sys.stdout.write(f"\t{file}\n") + sys.stdout.write("\n\n") + resp = input("Confirm deletion? [y/N] ") + if resp.lower() != "y": + logging.info("Skipped deletion") + return + + logging.info(f"Deleting {len(to_delete)} outdated files") + for file in to_delete: + logging.info(f"Deleting {file}") + file.unlink() + + +def main(force: bool = False, force_delete=False): + logging.basicConfig(level=logging.INFO) + (CASSETTE_ROOT / "download").mkdir(exist_ok=True) + + manifest = download_manifest() + logging.info(f"Downloaded manifest with {len(manifest)} cassettes") + current_hashes = load_hashes() + if force: + to_dl = list(manifest.keys()) + else: + to_dl = find_new(manifest, current_hashes) + logging.info(f"Downloaded {len(to_dl)} cassettes") + + for url in to_dl: + dl_cassette(manifest[url]) + update_hashes(current_hashes, url, manifest[url]["hash"]) + + cleanup_files(manifest, confirm=not force_delete) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-f", "--force", action="store_true", help="Force downloading all cassettes") + parser.add_argument("-d", "--delete", action="store_true", help="Delete unknown cassettes without confirmation") + parser.set_defaults(force=False, delete=False) + args = parser.parse_args() + main(force=args.force, force_delete=args.delete) \ No newline at end of file diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py index 75c5206..25a5fb1 100644 --- a/tests/integration/test_integration.py +++ b/tests/integration/test_integration.py @@ -1,39 +1,11 @@ -import json -from pathlib import Path - import pytest -import vcr from usp.tree import sitemap_tree_for_homepage -def pytest_generate_tests(metafunc): - # cassettes = list(Path(__file__).parent.joinpath('cassettes').glob('*.yaml')) - # cassette_names = [f"integration-{cassette.stem}" for cassette in cassettes] - # metafunc.parametrize('cassette_path', cassettes, ids=cassette_names, indirect=True) - cassettes_root = Path(__file__).parent / "cassettes" - - manifest_path = cassettes_root / "manifest.json" - if not manifest_path.exists(): - return - - manifest = json.loads(manifest_path.read_text()) - cassette_fixtures = [ - (url, cassettes_root / item["name"]) for url, item in manifest.items() - ] - cassette_ids = [f"integration-{url}" for url, _ in cassette_fixtures] - metafunc.parametrize("site_url,cassette_path", cassette_fixtures, 
ids=cassette_ids) - - -@pytest.fixture -def _with_vcr(cassette_path): - with vcr.use_cassette(cassette_path, record_mode="none"): - yield - - @pytest.mark.usefixtures("_with_vcr") @pytest.mark.integration -def test_integration(site_url, cassette_path): +def test_sitemap_parse(site_url, cassette_path): print(f"Loading {cassette_path}") sitemap = sitemap_tree_for_homepage(site_url) diff --git a/tests/integration/test_mem.py b/tests/integration/test_mem.py new file mode 100644 index 0000000..60be9ba --- /dev/null +++ b/tests/integration/test_mem.py @@ -0,0 +1,16 @@ +""" +This test loads all pages into a list for testing memory consumption. +""" + +import pytest + +from usp.tree import sitemap_tree_for_homepage + + +@pytest.mark.usefixtures("_with_vcr") +@pytest.mark.integration +def test_all_page_size(site_url, cassette_path): + print(f"Loading {cassette_path}") + sitemap = sitemap_tree_for_homepage(site_url) + pages = list(sitemap.all_pages()) + print(f"Site {site_url} has {len(pages)} pages") From 3ef0e6df849e097ec23709a30ea0493fdd275af0 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 3 Sep 2024 13:51:06 +0100 Subject: [PATCH 48/79] Remove old Travis file --- .travis.yml | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 2cc3657..0000000 --- a/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ -language: python -python: -- '3.5' -- '3.6' -- nightly -matrix: - include: - - python: 3.7 - dist: xenial - sudo: true -before_install: -- pip install coverage coveralls -install: -- pip install . -script: -- pip install .[test] -- coverage run --source=usp setup.py test -after_success: -- coveralls -deploy: - provider: pypi - skip_existing: true - user: mediacloud-travis - on: - tags: true - distributions: "sdist bdist_wheel" - password: - secure: d2oQd9ojE8K50uGgjuisonPGEC4NLGVHbAx/IFDCC3K5/oVDeNG7BuIqQdNS0ObFJWH8yjHDcjoq1J1RvRhJlNNWYercm5qN+3ANMePINBt7iCgzcoSA8/MyyKvlId/8VqEnbU1ZD4ou3QBfG5y2AXzrGZSS3qJ7TlT5mt8N31bDAdB2CsR3bRcVjtylu8zPuFarhpnn0X7y/T/jOWhVuO8OI2kd2P+3h9zR88nJVv8xThsCclwHqZns48yDmOHKpAjuSAeexUdnbLNPadSS3ial79WGcnjsnfb5vNTrp3H9dhQLoIJbCXjemwgWiGadOee0HQJDZvdNPJzojw6QXiXASORmVhLV3I1IKa0g+m2HPcGqBKWMloAvVQEd4d9SKH6/lf0unSIOb1UAeMASPsZTw//60pBH8L7SmcwtskJNfAr2RnUDK7P6C/vkwEYiET44DCPzRGzcoaQfp/Cybh9tSxbHpdqkCkW59VeVFA0dWgSrVfywwDbFACky0gHK/YEQK45dzRrUxKTBcYBD3RH2iIjZsMEesTSfjZ9ePT3gUdA4sqsYRASGq8nEqFWY32dqr/4JQulzFffdVQcbnrk/gxk1mRFIjboyx6c0vJaroO+tsNRQSlnMa8PbH6BE02YIRFgHPauLx8XZpOruXYCap6Mr8w6VvWjYH8M41uk= - From 87c7263c298816f4568b44004fcb05b1c7306167 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 4 Sep 2024 12:28:42 +0100 Subject: [PATCH 49/79] Allow datetime helpers to return None (fixes #31, #22) --- tests/test_helpers.py | 9 ++++++++- usp/helpers.py | 21 ++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 27ce468..fcf1ad6 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,5 +1,4 @@ import datetime - import pytest from usp.exceptions import ( @@ -72,6 +71,10 @@ def test_parse_iso8601_date(): tzinfo=datetime.timezone.utc, ) +def test_parse_iso8601_invalid_date(): + # GH#31 + assert parse_iso8601_date("2021-06-18T112:13:04+00:00") is None + assert parse_iso8601_date("not a date") is None def test_parse_rfc2822_date(): assert parse_rfc2822_date("Tue, 10 Aug 2010 20:43:53 -0000") == datetime.datetime( @@ -96,6 +99,10 @@ def test_parse_rfc2822_date(): 
         tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
     )
 
+def test_parse_rfc2822_date_invalid_date():
+    # GH#31
+    assert parse_rfc2822_date("Fri, 18 Jun 2021 112:13:04 UTC") is None
+    assert parse_rfc2822_date("not a date") is None
 
 # noinspection SpellCheckingInspection
 def test_is_http_url():
diff --git a/usp/helpers.py b/usp/helpers.py
index 6a610de..9f729f6 100644
--- a/usp/helpers.py
+++ b/usp/helpers.py
@@ -8,7 +8,6 @@
 import time
 from typing import Optional
 from urllib.parse import urlparse, unquote_plus, urlunparse
-
 from dateutil.parser import parse as dateutil_parse
 from dateutil.parser import isoparse as dateutil_isoparse
 
@@ -86,7 +85,7 @@ def html_unescape_strip(string: Optional[str]) -> Optional[str]:
     return string
 
 
-def parse_iso8601_date(date_string: str) -> datetime.datetime:
+def parse_iso8601_date(date_string: str) -> Optional[datetime.datetime]:
     """
     Parse ISO 8601 date (e.g. from sitemap's <lastmod>) into datetime.datetime object.
 
@@ -105,25 +104,29 @@ def parse_iso8601_date(date_string: str) -> datetime.datetime:
         # Try the more efficient ISO 8601 parser
         return dateutil_isoparse(date_string)
     except ValueError:
-        # Try the less efficient general parser
+        pass
+
+    # Try the less efficient general parser
+    try:
         return dateutil_parse(date_string)
+    except ValueError:
+        return None
 
 
-def parse_rfc2822_date(date_string: str) -> datetime.datetime:
+def parse_rfc2822_date(date_string: str) -> Optional[datetime.datetime]:
     """
     Parse RFC 2822 date (e.g. from Atom's <issued>) into datetime.datetime object.
 
     :param date_string: RFC 2822 date, e.g. "Tue, 10 Aug 2010 20:43:53 -0000".
     :return: datetime.datetime object of a parsed date.
     """
-    # FIXME parse known date formats faster
-    # TODO: fix naming of this function as it shouldn't actually be RFC2822
     if not date_string:
         raise SitemapException("Date string is unset.")
 
-    date = dateutil_parse(date_string)
-
-    return date
+    try:
+        return dateutil_parse(date_string)
+    except ValueError:
+        return None
 
 
 def get_url_retry_on_client_errors(

From 411d79491cbbeb0fcb45e2dfa0b27fa42aeaa08f Mon Sep 17 00:00:00 2001
From: Freddy Heppell
Date: Wed, 4 Sep 2024 12:28:57 +0100
Subject: [PATCH 50/79] Correct datetime format in docs RSS sample

---
 docs/reference/formats_examples/rss2.0.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/reference/formats_examples/rss2.0.xml b/docs/reference/formats_examples/rss2.0.xml
index 21d702a..fe23731 100644
--- a/docs/reference/formats_examples/rss2.0.xml
+++ b/docs/reference/formats_examples/rss2.0.xml
@@ -7,12 +7,12 @@
         <item>
            <title>Page 1</title>
            <link>https://example.org/page1</link>
-           <pubDate>2024-01-01</pubDate>
+           <pubDate>Mon, 01 Jan 2024 12:00:00 UTC</pubDate>
        </item>
        <item>
            <title>Page 2</title>
            <link>https://example.org/page2</link>
-           <pubDate>2024-01-02</pubDate>
+           <pubDate>Tue, 02 Jan 2024 14:00:00 UTC</pubDate>
        </item>
     </channel>
 </rss>
\ No newline at end of file

From 711552ac579497241a739f17cb4b1a3ade5f6cf8 Mon Sep 17 00:00:00 2001
From: Freddy Heppell
Date: Wed, 4 Sep 2024 12:29:22 +0100
Subject: [PATCH 51/79] Correct datetime parse function used by Atom

---
 usp/fetch_parse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py
index 8965e03..a792ab0 100644
--- a/usp/fetch_parse.py
+++ b/usp/fetch_parse.py
@@ -973,7 +973,7 @@ def page(self) -> Optional[SitemapPage]:
         publication_date = html_unescape_strip(self.publication_date)
 
         if publication_date:
-            publication_date = parse_rfc2822_date(publication_date)
+            publication_date = parse_iso8601_date(publication_date)
 
         return SitemapPage(
             url=link,

From b4f0e1de6437250dc65b5812f49681df393a8e76 Mon Sep 17 00:00:00 2001
From: Freddy Heppell
Date: Wed, 4 Sep 2024 12:29:41 +0100
Subject: [PATCH 52/79] Add tests for sitemap truncation

---
 tests/tree/test_edges.py | 88 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/tests/tree/test_edges.py b/tests/tree/test_edges.py
index e4ad61e..679f28f 100644
--- a/tests/tree/test_edges.py
+++ b/tests/tree/test_edges.py
@@ -126,3 +126,91 @@ def test_max_recursion_level_robots(self, requests_mock):
         tree = sitemap_tree_for_homepage(self.TEST_BASE_URL)
         sitemaps = list(tree.all_sitemaps())
         assert type(sitemaps[-1]) is InvalidSitemap
+
+    def test_truncated_sitemap_missing_close_urlset(self, requests_mock):
+        requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher)
+
+        requests_mock.get(
+            self.TEST_BASE_URL + "/robots.txt",
+            headers={"Content-Type": "text/plain"},
+            text=(
+                textwrap.dedent(
+                    f"""
+                    User-agent: *
+                    Disallow: /whatever
+
+                    Sitemap: {self.TEST_BASE_URL}/sitemap.xml
+                    """
+                ).strip()
+            ),
+        )
+
+        sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+        """
+        for x in range(50):
+            sitemap_xml += f"""
+            <url>
+                <loc>{self.TEST_BASE_URL}/page_{x}.html</loc>
+            </url>
+            """
+
+        requests_mock.get(
+            self.TEST_BASE_URL + "/sitemap.xml",
+            headers={"Content-Type": "application/xml"},
+            text=(
+                textwrap.dedent(sitemap_xml).strip()
+            ),
+        )
+
+        tree = sitemap_tree_for_homepage(self.TEST_BASE_URL)
+        assert len(list(tree.all_pages())) == 50
+
+    def test_truncated_sitemap_mid_url(self, requests_mock):
+        requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher)
+
+        requests_mock.get(
+            self.TEST_BASE_URL + "/robots.txt",
+            headers={"Content-Type": "text/plain"},
+            text=(
+                textwrap.dedent(
+                    f"""
+                    User-agent: *
+                    Disallow: /whatever
+
+                    Sitemap: {self.TEST_BASE_URL}/sitemap.xml
+                    """
+                ).strip()
+            ),
+        )
+
+        sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+        """
+        for x in range(49):
+            sitemap_xml += f"""
+            <url>
+                <loc>{self.TEST_BASE_URL}/page_{x}.html</loc>
+            </url>
+            """
+        sitemap_xml += f"""
+            <url>
+                <loc>{self.TEST_BASE_URL}/page_
+            """
+
+        requests_mock.get(
+            self.TEST_BASE_URL + "/sitemap.xml",
+            headers={"Content-Type": "application/xml"},
+            text=(
+                textwrap.dedent(sitemap_xml).strip()
+            ),
+        )
+
+        tree = sitemap_tree_for_homepage(self.TEST_BASE_URL)
+        all_pages = list(tree.all_pages())
+        assert len(all_pages) == 49
+        assert all_pages[-1].url.endswith('page_48.html')
\ No newline at end of file

From 2cadbc7f932bec2673281cb41996a1c361e9f294 Mon Sep 17 00:00:00 2001
From: Freddy Heppell
Date: Wed, 4 Sep 2024 13:07:17 +0100
Subject: [PATCH 53/79] Improve web client option docstrings

---
 usp/web_client/abstract_client.py |  4 ++--
 usp/web_client/requests_client.py | 18 ++++++++++--------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/usp/web_client/abstract_client.py b/usp/web_client/abstract_client.py
index 9b2cc7e..f7a2abd 100644
--- a/usp/web_client/abstract_client.py
+++ b/usp/web_client/abstract_client.py
@@ -144,11 +144,11 @@ class AbstractWebClient(metaclass=abc.ABCMeta):
     """
 
     @abc.abstractmethod
-    def set_max_response_data_length(self, max_response_data_length: int) -> None:
+    def set_max_response_data_length(self, max_response_data_length: Optional[int]) -> None:
         """
         Set the maximum number of bytes that the web client will fetch.
 
-        :param max_response_data_length: Maximum number of bytes that the web client will fetch.
+        :param max_response_data_length: Maximum number of bytes that the web client will fetch, or None to fetch all.
""" raise NotImplementedError("Abstract method.") diff --git a/usp/web_client/requests_client.py b/usp/web_client/requests_client.py index 5a9121c..c27f58d 100644 --- a/usp/web_client/requests_client.py +++ b/usp/web_client/requests_client.py @@ -1,7 +1,7 @@ """Implementation of :mod:`usp.web_client.abstract_client` with Requests.""" from http import HTTPStatus -from typing import Optional, Dict +from typing import Optional, Dict, Tuple, Union import requests @@ -89,8 +89,14 @@ def __init__(self, verify=True): self.__proxies = {} self.__verify = verify - def set_timeout(self, timeout: int) -> None: - """Set HTTP request timeout.""" + def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None: + """Set HTTP request timeout. + + See also: `Requests timeout docs `__ + + :param timeout: An integer to use as both the connect and read timeouts, + or a tuple to specify them individually, or None for no timeout + """ # Used mostly for testing self.__timeout = timeout @@ -98,17 +104,13 @@ def set_proxies(self, proxies: Dict[str, str]) -> None: """ Set a proxy for the request. - * keys are schemes, e.g. "http" or "https"; - * values are "scheme://user:password@host:port/". - :param proxies: Proxy definition where the keys are schemes ("http" or "https") and values are the proxy address. - Example: ``{'http': 'http://user:pass@10.10.1.10:3128/'}`` + Example: ``{'http': 'http://user:pass@10.10.1.10:3128/'}, or an empty dict to disable proxy.`` """ # Used mostly for testing self.__proxies = proxies def set_max_response_data_length(self, max_response_data_length: int) -> None: - """Set max response data length.""" self.__max_response_data_length = max_response_data_length def get(self, url: str) -> AbstractWebClientResponse: From 8f6206319cb3410ec589e3e047c6391cdfeefcca Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Thu, 5 Sep 2024 15:04:54 +0100 Subject: [PATCH 54/79] Change recursion test to match #29 --- tests/tree/test_edges.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/tree/test_edges.py b/tests/tree/test_edges.py index 679f28f..8cfc172 100644 --- a/tests/tree/test_edges.py +++ b/tests/tree/test_edges.py @@ -107,7 +107,9 @@ def test_max_recursion_level_xml(self, requests_mock): assert type(sitemaps[-1]) is InvalidSitemap - def test_max_recursion_level_robots(self, requests_mock): + def test_max_recursion_level_sitemap_with_robots(self, requests_mock): + # GH#29 + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) requests_mock.get( self.TEST_BASE_URL + "/robots.txt", @@ -118,11 +120,29 @@ def test_max_recursion_level_robots(self, requests_mock): User-agent: * Disallow: /whatever - Sitemap: {self.TEST_BASE_URL}/robots.txt + Sitemap: {self.TEST_BASE_URL}/sitemap.xml """ ).strip() ), ) + requests_mock.get( + self.TEST_BASE_URL + "/sitemap.xml", + headers={"Content-Type": "application/xml"}, + text=( + textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/robots.txt + 2024-01-01 + + + """ + ).strip() + ), + ) + tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) sitemaps = list(tree.all_sitemaps()) assert type(sitemaps[-1]) is InvalidSitemap @@ -160,9 +180,7 @@ def test_truncated_sitemap_missing_close_urlset(self, requests_mock): requests_mock.get( self.TEST_BASE_URL + "/sitemap.xml", headers={"Content-Type": "application/xml"}, - text=( - textwrap.dedent(sitemap_xml).strip() - ), + text=(textwrap.dedent(sitemap_xml).strip()), ) tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) @@ -205,12 
+223,10 @@ def test_truncated_sitemap_mid_url(self, requests_mock): requests_mock.get( self.TEST_BASE_URL + "/sitemap.xml", headers={"Content-Type": "application/xml"}, - text=( - textwrap.dedent(sitemap_xml).strip() - ), + text=(textwrap.dedent(sitemap_xml).strip()), ) tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) all_pages = list(tree.all_pages()) assert len(all_pages) == 49 - assert all_pages[-1].url.endswith('page_48.html') \ No newline at end of file + assert all_pages[-1].url.endswith("page_48.html") From 9800b7ee72682da301e74c7c134d4bc966f52695 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Thu, 5 Sep 2024 15:05:39 +0100 Subject: [PATCH 55/79] Ruff --- tests/integration/conftest.py | 1 + tests/integration/download.py | 30 ++++++++++++++++++++------- tests/integration/test_integration.py | 6 ++++-- tests/test_helpers.py | 4 ++++ usp/web_client/abstract_client.py | 4 +++- 5 files changed, 35 insertions(+), 10 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 395abf6..c7fb754 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -27,6 +27,7 @@ def pytest_collection_modifyitems(config, items): if "integration" in item.keywords: item.add_marker(skip_perf) + def pytest_generate_tests(metafunc): # cassettes = list(Path(__file__).parent.joinpath('cassettes').glob('*.yaml')) # cassette_names = [f"integration-{cassette.stem}" for cassette in cassettes] diff --git a/tests/integration/download.py b/tests/integration/download.py index 4ffea25..3521179 100644 --- a/tests/integration/download.py +++ b/tests/integration/download.py @@ -13,6 +13,7 @@ MANIFEST_FILE = f"{CASSETTE_REPO}/raw/main/manifest.json" CASSETTE_ROOT = Path(__file__).parent / "cassettes" + def download_manifest(): r = requests.get(MANIFEST_FILE, allow_redirects=True) r.raise_for_status() @@ -24,6 +25,7 @@ def download_manifest(): return data + def load_hashes(): if not (CASSETTE_ROOT / "hashes.json").exists(): return {} @@ -31,20 +33,23 @@ def load_hashes(): with open(CASSETTE_ROOT / "hashes.json") as f: return json.load(f) + def find_new(manifest, current_hashes): to_dl = [] for url, data in manifest.items(): - if current_hashes.get(url, {}) != data['hash']: + if current_hashes.get(url, {}) != data["hash"]: logging.info(f"{url} is out-of-date") to_dl.append(url) return to_dl + def calc_hash(path): with open(path, "rb") as f: return hashlib.sha256(f.read()).hexdigest() + def dl_cassette(data): dl_gz_path = CASSETTE_ROOT / "download" / f"{data['name']}.gz" logging.info(f"Downloading {data['url']} to {dl_gz_path}") @@ -59,23 +64,27 @@ def dl_cassette(data): dl_hash = calc_hash(dl_gz_path) if dl_hash != data["hash"]: - logging.error(f"Downloaded file hash {dl_hash} does not match expected hash {data['hash']}") + logging.error( + f"Downloaded file hash {dl_hash} does not match expected hash {data['hash']}" + ) exit(1) logging.info(f"Download completed, extracting to {cassette_path}") - with gzip.open(dl_gz_path, 'rb') as f_gz: - with open(cassette_path, 'wb') as f_cassette: + with gzip.open(dl_gz_path, "rb") as f_gz: + with open(cassette_path, "wb") as f_cassette: shutil.copyfileobj(f_gz, f_cassette) return dl_gz_path, cassette_path + def update_hashes(current_hashes, url, new_hashes): current_hashes[url] = new_hashes with open(CASSETTE_ROOT / "hashes.json", "w") as f: json.dump(current_hashes, f, indent=2) + def cleanup_files(data, confirm=True): cassettes = CASSETTE_ROOT.glob("*.yaml") downloads = (CASSETTE_ROOT / "download").glob("*.yaml.gz") @@ 
-131,8 +140,15 @@ def main(force: bool = False, force_delete=False): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-f", "--force", action="store_true", help="Force downloading all cassettes") - parser.add_argument("-d", "--delete", action="store_true", help="Delete unknown cassettes without confirmation") + parser.add_argument( + "-f", "--force", action="store_true", help="Force downloading all cassettes" + ) + parser.add_argument( + "-d", + "--delete", + action="store_true", + help="Delete unknown cassettes without confirmation", + ) parser.set_defaults(force=False, delete=False) args = parser.parse_args() - main(force=args.force, force_delete=args.delete) \ No newline at end of file + main(force=args.force, force_delete=args.delete) diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py index 25a5fb1..6c198ae 100644 --- a/tests/integration/test_integration.py +++ b/tests/integration/test_integration.py @@ -1,3 +1,5 @@ +import logging + import pytest from usp.tree import sitemap_tree_for_homepage @@ -6,7 +8,7 @@ @pytest.mark.usefixtures("_with_vcr") @pytest.mark.integration def test_sitemap_parse(site_url, cassette_path): - print(f"Loading {cassette_path}") + logging.critical(f"Loading {cassette_path}") sitemap = sitemap_tree_for_homepage(site_url) # Do this over converting to a list() as this will load all pages into memory @@ -14,4 +16,4 @@ def test_sitemap_parse(site_url, cassette_path): page_count = 0 for page in sitemap.all_pages(): page_count += 1 - print(f"Site {site_url} has {page_count} pages") + logging.critical(f"Site {site_url} has {page_count} pages") diff --git a/tests/test_helpers.py b/tests/test_helpers.py index fcf1ad6..1c8d3a1 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -71,11 +71,13 @@ def test_parse_iso8601_date(): tzinfo=datetime.timezone.utc, ) + def test_parse_iso8601_invalid_date(): # GH#31 assert parse_iso8601_date("2021-06-18T112:13:04+00:00") is None assert parse_iso8601_date("not a date") is None + def test_parse_rfc2822_date(): assert parse_rfc2822_date("Tue, 10 Aug 2010 20:43:53 -0000") == datetime.datetime( year=2010, @@ -99,11 +101,13 @@ def test_parse_rfc2822_date(): tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)), ) + def test_parse_rfc2822_date_invalid_date(): # GH#31 assert parse_rfc2822_date("Fri, 18 Jun 2021 112:13:04 UTC") is None assert parse_rfc2822_date("not a date") is None + # noinspection SpellCheckingInspection def test_is_http_url(): # noinspection PyTypeChecker diff --git a/usp/web_client/abstract_client.py b/usp/web_client/abstract_client.py index f7a2abd..d828ddc 100644 --- a/usp/web_client/abstract_client.py +++ b/usp/web_client/abstract_client.py @@ -144,7 +144,9 @@ class AbstractWebClient(metaclass=abc.ABCMeta): """ @abc.abstractmethod - def set_max_response_data_length(self, max_response_data_length: Optional[int]) -> None: + def set_max_response_data_length( + self, max_response_data_length: Optional[int] + ) -> None: """ Set the maximum number of bytes that the web client will fetch. 
From f78211e818054a6f0968ddfb803e2190b8df9e66 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 6 Sep 2024 12:45:27 +0100 Subject: [PATCH 56/79] Add local parsing support --- docs/changelog.rst | 39 +++++++++-- docs/get-started.rst | 19 +++++ docs/index.rst | 3 +- docs/reference/api/usp.fetch_parse.rst | 4 ++ docs/reference/api/usp.tree.rst | 2 + .../api/usp.web_client.abstract_client.rst | 6 ++ tests/tree/test_from_str.py | 69 +++++++++++++++++++ usp/fetch_parse.py | 62 ++++++++++++++--- usp/tree.py | 14 +++- usp/web_client/abstract_client.py | 21 ++++++ 10 files changed, 220 insertions(+), 19 deletions(-) create mode 100644 tests/tree/test_from_str.py diff --git a/docs/changelog.rst b/docs/changelog.rst index f56ca85..7f6b82e 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -4,18 +4,45 @@ Changelog v1.0.0 (upcoming) ----------------- -- TODO +**New Features** + +- CLI tool to parse and list sitemaps on the command line (see :doc:`/reference/cli`) +- All sitemap objects now implement a consistent interface, allowing traversal of the tree irrespective of type: + - All sitemaps now have ``pages`` and ``sub_sitemaps`` properties, returning their children of that type, or an empty list where not applicable + - Added ``all_sitemaps()`` method to iterate over all descendant sitemaps +- Pickling page sitemaps now includes page data, which previously was not included as it was swapped to disk +- Sitemaps and pages now implement ``to_dict()`` method to convert to dictionaries +- Added optional arguments to ``usp.tree.sitemap_tree_for_homepage()`` to disable robots.txt-based or known-path-based sitemap discovery. Default behaviour is still to use both. +- Parse sitemaps from a string with :ref:`local parse` + +**Performance** + +Improvement of parse performance by approximately 90%: + +- Optimised lookup of page URLs when checking if duplicate +- Optimised datetime parse in XML Sitemaps by trying full ISO8601 parsers before the general parser + +**Bug Fixes** + +- Invalid datetimes will be parsed as ``None`` instead of crashing (reported in :issue:`22`, :issue:`31`) +- Moved ``__version__`` attribute into main class module +- Robots.txt index sitemaps now count for the max recursion depth (reported in :issue:`29`). The default maximum has been increased by 1 to compensate for this. 
v0.6 (upcoming) --------------- -- Add proxy support with :meth:`.RequestsWebClient.set_proxies` (:pr:`20` by :user:`tgrandje`) +**New Features** + +- Add proxy support with ``RequestsWebClient.set_proxies()`` (:pr:`20` by :user:`tgrandje`) - Add additional sitemap discovery paths for news sitemaps (:commit:`d3bdaae56be87c97ce2f3f845087f495f6439b44`) -- Resolve warnings caused by :external+python:class:`http.HTTPStatus` usage (:commit:`3867b6e`) -- Don't add :class:`~.InvalidSitemap` object if ``robots.txt`` is not found (:pr:`39` by :user:`gbenson`) -- Add parameter to :meth:`~.RequestsWebClient.__init__` to disable certificate verification (:pr:`37` by :user:`japherwocky`) -- Remove log configuration so it can be specified at application level (:pr:`24` by :user:`dsoprea`) +- Add parameter to ``RequestsWebClient.__init__()`` to disable certificate verification (:pr:`37` by :user:`japherwocky`) +**Bug Fixes** + +- Remove log configuration so it can be specified at application level (:pr:`24` by :user:`dsoprea`) +- Resolve warnings caused by :external+python:class:`http.HTTPStatus` usage (:commit:`3867b6e`) +- Don't add ``InvalidSitemap`` object if ``robots.txt`` is not found (:pr:`39` by :user:`gbenson`) +- Fix incorrect lowercasing of URLS discovered in robots.txt (:pr:`35`) Prior versions -------------- diff --git a/docs/get-started.rst b/docs/get-started.rst index c75e6f6..c14fe39 100644 --- a/docs/get-started.rst +++ b/docs/get-started.rst @@ -36,3 +36,22 @@ This will return a tree representing the structure of the sitemaps. To iterate t This will output the URL of each page in the sitemap, loading the parsed representations of sitemaps `lazily to reduce memory usage `_ in very large sitemaps. Each page is an instance of :class:`~usp.objects.page.SitemapPage`, which will always have at least a URL and priority, and may have other attributes if present. + +.. _local parse: + +Local Parsing +------------- + +USP is primarily designed to fetch live sitemaps from the web, but does support local parsing too: + +.. code-block:: + + from usp.tree import sitemap_from_str + + # Load your sitemap and parse it in + parsed_sitemap = sitemap_from_str("...") + + for page in parsed_sitemap.all_pages(): + print(page.url) + +The returned object will be the appropriate child class of :class:`~.AbstractSitemap`. Page sitemaps will have their pages as above, but in index sitemaps each sub-sitemap will be an :class:`~usp.objects.sitemap.InvalidSitemap` (as it's unable to make a request to fetch them). \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 5483570..8b3dccf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -87,6 +87,7 @@ USP is very easy to use, with just a single line of code it can traverse and par Advanced Features ----------------- -- :doc:`CLI Client `: Use the ``usp ls`` tool to work with sitemaps from the command line. +- :doc:`CLI Client `: Use the ``usp ls`` tool to work with sitemaps from the command line - :doc:`Serialisation `: Export raw data or save to disk and load later +- :ref:`local parse`: Use USP's sitemap parsers on sitemaps which have already been downloaded - Custom web clients: Instead of the default client built on `requests `_ you can use your own web client by implementing the :class:`~usp.web_client.abstract_client.AbstractWebClient` interface. 
\ No newline at end of file diff --git a/docs/reference/api/usp.fetch_parse.rst b/docs/reference/api/usp.fetch_parse.rst index 2279a7b..790ff42 100644 --- a/docs/reference/api/usp.fetch_parse.rst +++ b/docs/reference/api/usp.fetch_parse.rst @@ -6,6 +6,10 @@ usp.fetch_parse .. autoclass:: SitemapFetcher :members: +.. autoclass:: SitemapStrParser + :members: + :show-inheritance: + .. autoclass:: AbstractSitemapParser :members: diff --git a/docs/reference/api/usp.tree.rst b/docs/reference/api/usp.tree.rst index 463d8f6..ddf61b9 100644 --- a/docs/reference/api/usp.tree.rst +++ b/docs/reference/api/usp.tree.rst @@ -5,3 +5,5 @@ usp.tree .. autofunction:: sitemap_tree_for_homepage .. autodata:: _UNPUBLISHED_SITEMAP_PATHS + +.. autofunction:: sitemap_from_str diff --git a/docs/reference/api/usp.web_client.abstract_client.rst b/docs/reference/api/usp.web_client.abstract_client.rst index a4c41f4..5e4078a 100644 --- a/docs/reference/api/usp.web_client.abstract_client.rst +++ b/docs/reference/api/usp.web_client.abstract_client.rst @@ -19,3 +19,9 @@ usp.web_client.abstract_client :members: :show-inheritance: +.. autoclass:: LocalWebClient + :members: + :show-inheritance: + +.. autoclass:: NoWebClientException + :show-inheritance: diff --git a/tests/tree/test_from_str.py b/tests/tree/test_from_str.py new file mode 100644 index 0000000..c612bb4 --- /dev/null +++ b/tests/tree/test_from_str.py @@ -0,0 +1,69 @@ +import textwrap + +from tests.tree.base import TreeTestBase +from usp.objects.page import SitemapPage +from usp.objects.sitemap import IndexXMLSitemap, InvalidSitemap, PagesXMLSitemap +from usp.tree import sitemap_from_str + + +class TestSitemapFromStrStr(TreeTestBase): + def test_xml_pages(self): + parsed = sitemap_from_str( + content=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/about.html + {self.TEST_DATE_STR_ISO8601} + monthly + 0.8 + + + {self.TEST_BASE_URL}/contact.html + {self.TEST_DATE_STR_ISO8601} + + + when we feel like it + + + 1.1 + + + + """ + ).strip() + ) + + assert isinstance(parsed, PagesXMLSitemap) + assert len(list(parsed.all_pages())) == 2 + assert all([isinstance(page, SitemapPage) for page in parsed.all_pages()]) + + def test_xml_index(self): + parsed = sitemap_from_str( + content=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/sitemap_news_1.xml + {self.TEST_DATE_STR_ISO8601} + + + {self.TEST_BASE_URL}/sitemap_news_index_2.xml + {self.TEST_DATE_STR_ISO8601} + + + """ + ).strip() + ) + + assert isinstance(parsed, IndexXMLSitemap) + assert len(parsed.sub_sitemaps) == 2 + assert all( + [ + isinstance(sub_sitemap, InvalidSitemap) + for sub_sitemap in parsed.sub_sitemaps + ] + ) + assert parsed.sub_sitemaps[0].url == self.TEST_BASE_URL + "/sitemap_news_1.xml" diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index a792ab0..105968e 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -12,7 +12,7 @@ import xml.parsers.expat from collections import OrderedDict from decimal import Decimal -from typing import Optional, Dict +from typing import Optional, Dict, Union from .exceptions import SitemapException, SitemapXMLParsingException from .helpers import ( @@ -45,6 +45,7 @@ AbstractWebClientSuccessResponse, WebClientErrorResponse, ) +from .web_client.abstract_client import LocalWebClient, NoWebClientException from .web_client.requests_client import RequestsWebClient log = create_logger(__name__) @@ -101,6 +102,19 @@ def __init__( self._web_client = web_client self._recursion_level = recursion_level + def _fetch(self) -> Union[str, WebClientErrorResponse]: + 
log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...") + response = get_url_retry_on_client_errors( + url=self._url, web_client=self._web_client + ) + + if isinstance(response, WebClientErrorResponse): + return response + + assert isinstance(response, AbstractWebClientSuccessResponse) + + return ungzipped_response_content(url=self._url, response=response) + def sitemap(self) -> AbstractSitemap: """ Fetch and parse the sitemap. @@ -108,21 +122,14 @@ def sitemap(self) -> AbstractSitemap: :return: the parsed sitemap. Will be a child of :class:`~.AbstractSitemap`. If an HTTP error is encountered, or the sitemap cannot be parsed, will be :class:`~.InvalidSitemap`. """ - log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...") - response = get_url_retry_on_client_errors( - url=self._url, web_client=self._web_client - ) + response_content = self._fetch() - if isinstance(response, WebClientErrorResponse): + if isinstance(response_content, WebClientErrorResponse): return InvalidSitemap( url=self._url, - reason=f"Unable to fetch sitemap from {self._url}: {response.message()}", + reason=f"Unable to fetch sitemap from {self._url}: {response_content.message()}", ) - assert isinstance(response, AbstractWebClientSuccessResponse) - - response_content = ungzipped_response_content(url=self._url, response=response) - # MIME types returned in Content-Type are unpredictable, so peek into the content instead if response_content[:20].strip().startswith("<"): # XML sitemap (the specific kind is to be determined later) @@ -156,6 +163,31 @@ def sitemap(self) -> AbstractSitemap: return sitemap +class SitemapStrParser(SitemapFetcher): + """Custom fetcher to parse a string instead of download from a URL. + + This is a little bit hacky, but it allows us to support local content parsing without + having to change too much. 
+ """ + + __slots__ = ["_static_content"] + + def __init__(self, static_content: str): + """Init a new string parser + + :param static_content: String containing sitemap text to parse + """ + super().__init__( + url="http://usp-local-dummy.local/", + recursion_level=0, + web_client=LocalWebClient(), + ) + self._static_content = static_content + + def _fetch(self) -> Union[str, WebClientErrorResponse]: + return self._static_content + + class AbstractSitemapParser(metaclass=abc.ABCMeta): """Abstract robots.txt / XML / plain text sitemap parser.""" @@ -239,6 +271,10 @@ def sitemap(self) -> AbstractSitemap: web_client=self._web_client, ) fetched_sitemap = fetcher.sitemap() + except NoWebClientException: + fetched_sitemap = InvalidSitemap( + url=sitemap_url, reason="Un-fetched child sitemap" + ) except Exception as ex: fetched_sitemap = InvalidSitemap( url=sitemap_url, @@ -538,6 +574,10 @@ def sitemap(self) -> AbstractSitemap: web_client=self._web_client, ) fetched_sitemap = fetcher.sitemap() + except NoWebClientException: + fetched_sitemap = InvalidSitemap( + url=sub_sitemap_url, reason="Un-fetched child sitemap" + ) except Exception as ex: fetched_sitemap = InvalidSitemap( url=sub_sitemap_url, diff --git a/usp/tree.py b/usp/tree.py index 2045e01..6b76bce 100644 --- a/usp/tree.py +++ b/usp/tree.py @@ -3,7 +3,7 @@ from typing import Optional from .exceptions import SitemapException -from .fetch_parse import SitemapFetcher +from .fetch_parse import SitemapFetcher, SitemapStrParser from .helpers import is_http_url, strip_url_to_homepage from .log import create_logger from .objects.sitemap import ( @@ -101,3 +101,15 @@ def sitemap_tree_for_homepage( index_sitemap = IndexWebsiteSitemap(url=homepage_url, sub_sitemaps=sitemaps) return index_sitemap + + +def sitemap_from_str(content: str) -> AbstractSitemap: + """Parse sitemap from a string. + + Will return the parsed sitemaps, and any sub-sitemaps will be returned as :class:`~.InvalidSitemap`. + + :param content: Sitemap string to parse + :return: Parsed sitemap + """ + fetcher = SitemapStrParser(static_content=content) + return fetcher.sitemap() diff --git a/usp/web_client/abstract_client.py b/usp/web_client/abstract_client.py index d828ddc..ebf0b15 100644 --- a/usp/web_client/abstract_client.py +++ b/usp/web_client/abstract_client.py @@ -166,3 +166,24 @@ def get(self, url: str) -> AbstractWebClientResponse: :return: Response object. """ raise NotImplementedError("Abstract method.") + + +class NoWebClientException(Exception): + """Error indicating this web client cannot fetch pages.""" + + pass + + +class LocalWebClient(AbstractWebClient): + """Dummy web client which is a valid implementation but errors if called. 
+ + Used for local parsing + """ + + def set_max_response_data_length( + self, max_response_data_length: Optional[int] + ) -> None: + pass + + def get(self, url: str) -> AbstractWebClientResponse: + raise NoWebClientException From 099ddab7b1ea6fa4061eebc5ec335bc397ec7229 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 6 Sep 2024 13:08:09 +0100 Subject: [PATCH 57/79] Avoid error if priority is invalid --- usp/fetch_parse.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index 105968e..817fb79 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -11,9 +11,10 @@ import re import xml.parsers.expat from collections import OrderedDict -from decimal import Decimal +from decimal import Decimal, InvalidOperation from typing import Optional, Dict, Union + from .exceptions import SitemapException, SitemapXMLParsingException from .helpers import ( html_unescape_strip, @@ -591,6 +592,10 @@ def sitemap(self) -> AbstractSitemap: return index_sitemap +MIN_VALID_PRIORITY = Decimal("0.0") +MAX_VALID_PRIORITY = Decimal("1.0") + + class PagesXMLSitemapParser(AbstractXMLSitemapParser): """ Pages XML sitemap parser. @@ -663,20 +668,15 @@ def page(self) -> Optional[SitemapPage]: priority = html_unescape_strip(self.priority) if priority: - priority = Decimal(priority) - - comp_zero = priority.compare(Decimal("0.0")) - comp_one = priority.compare(Decimal("1.0")) - if comp_zero in ( - Decimal("0"), - Decimal("1") and comp_one in (Decimal("0"), Decimal("-1")), - ): - # 0 <= priority <= 1 - pass - else: - log.warning(f"Priority is not within 0 and 1: {priority}") + try: + priority = Decimal(priority) + + if priority < MIN_VALID_PRIORITY or priority > MAX_VALID_PRIORITY: + log.warning(f"Priority is not within 0 and 1: {priority}") + priority = SITEMAP_PAGE_DEFAULT_PRIORITY + except InvalidOperation: + log.warning(f"Invalid priority: {priority}") priority = SITEMAP_PAGE_DEFAULT_PRIORITY - else: priority = SITEMAP_PAGE_DEFAULT_PRIORITY From fcfd2c7d8827013a60f61bb7987f9e233a3a5aa0 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 6 Sep 2024 14:32:07 +0100 Subject: [PATCH 58/79] Support image sitemap extension --- docs/reference/api/usp.objects.page.rst | 3 + tests/tree/test_xml_exts.py | 107 ++++++++++++++++ usp/fetch_parse.py | 73 ++++++++++- usp/objects/page.py | 161 +++++++++++++++++++----- usp/objects/sitemap.py | 4 +- 5 files changed, 315 insertions(+), 33 deletions(-) create mode 100644 tests/tree/test_xml_exts.py diff --git a/docs/reference/api/usp.objects.page.rst b/docs/reference/api/usp.objects.page.rst index bde277a..1403b1a 100644 --- a/docs/reference/api/usp.objects.page.rst +++ b/docs/reference/api/usp.objects.page.rst @@ -12,3 +12,6 @@ usp.objects.page .. autoclass:: SitemapNewsStory :members: +.. 
autoclass:: SitemapImage + :members: + diff --git a/tests/tree/test_xml_exts.py b/tests/tree/test_xml_exts.py new file mode 100644 index 0000000..78735e3 --- /dev/null +++ b/tests/tree/test_xml_exts.py @@ -0,0 +1,107 @@ +import textwrap + +from tests.tree.base import TreeTestBase +from usp.objects.page import SitemapImage, SitemapPage +from usp.objects.sitemap import ( + IndexRobotsTxtSitemap, + IndexWebsiteSitemap, + PagesXMLSitemap, +) +from usp.tree import sitemap_tree_for_homepage + + +class TestXMLExts(TreeTestBase): + def test_xml_image(self, requests_mock): + requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher) + + requests_mock.get( + self.TEST_BASE_URL + "/robots.txt", + headers={"Content-Type": "text/plain"}, + text=textwrap.dedent( + f""" + User-agent: * + Disallow: /whatever + + Sitemap: {self.TEST_BASE_URL}/sitemap_images.xml + + """ + ).strip(), + ) + + requests_mock.get( + self.TEST_BASE_URL + "/sitemap_images.xml", + headers={"Content-Type": "text/xml"}, + text=textwrap.dedent( + f""" + + + + {self.TEST_BASE_URL}/sample1.html + + {self.TEST_BASE_URL}/image.jpg + Example Caption + Sheffield, UK + Example Title + https://creativecommons.org/publicdomain/zero/1.0/ + + + {self.TEST_BASE_URL}/photo.jpg + + + + {self.TEST_BASE_URL}/sample2.html + + {self.TEST_BASE_URL}/picture.jpg + + + + """ + ).strip(), + ) + + tree = sitemap_tree_for_homepage(self.TEST_BASE_URL) + + expected_sitemap_tree = IndexWebsiteSitemap( + url=f"{self.TEST_BASE_URL}/", + sub_sitemaps=[ + IndexRobotsTxtSitemap( + url=f"{self.TEST_BASE_URL}/robots.txt", + sub_sitemaps=[ + PagesXMLSitemap( + url=f"{self.TEST_BASE_URL}/sitemap_images.xml", + pages=[ + SitemapPage( + url=f"{self.TEST_BASE_URL}/sample1.html", + images=[ + SitemapImage( + loc=f"{self.TEST_BASE_URL}/image.jpg", + caption="Example Caption", + geo_location="Sheffield, UK", + title="Example Title", + license_="https://creativecommons.org/publicdomain/zero/1.0/", + ), + SitemapImage( + loc=f"{self.TEST_BASE_URL}/photo.jpg" + ), + ], + ), + SitemapPage( + url=f"{self.TEST_BASE_URL}/sample2.html", + images=[ + SitemapImage( + loc=f"{self.TEST_BASE_URL}/picture.jpg" + ), + ], + ), + ], + ) + ], + ) + ], + ) + + print(tree.to_dict()) + print(tree) + + assert tree == expected_sitemap_tree diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index 817fb79..2e4d8fa 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -26,6 +26,7 @@ ) from .log import create_logger from .objects.page import ( + SitemapImage, SitemapPage, SitemapNewsStory, SitemapPageChangeFrequency, @@ -403,6 +404,10 @@ def __normalize_xml_element_name(cls, name: str): name = f"sitemap:{name}" elif "/sitemap-news/" in namespace_url: name = f"news:{name}" + elif "/sitemap-image/" in namespace_url: + name = f"image:{name}" + elif "/sitemap-video/" in namespace_url: + name = f"video:{name}" else: # We don't care about the rest of the namespaces, so just keep the plain element name pass @@ -601,6 +606,24 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser): Pages XML sitemap parser. 
""" + class Image: + __slots__ = ["loc", "caption", "geo_location", "title", "license"] + + def __init__(self): + self.loc = None + self.caption = None + self.geo_location = None + self.title = None + self.license = None + + def __hash__(self): + return hash( + ( + # Hash only the URL to be able to find unique ones + self.loc, + ) + ) + class Page: """Simple data class for holding various properties for a single entry while parsing.""" @@ -617,6 +640,7 @@ class Page: "news_genres", "news_keywords", "news_stock_tickers", + "images", ] def __init__(self): @@ -632,6 +656,7 @@ def __init__(self): self.news_genres = None self.news_keywords = None self.news_stock_tickers = None + self.images = [] def __hash__(self): return hash( @@ -723,15 +748,29 @@ def page(self) -> Optional[SitemapPage]: stock_tickers=news_stock_tickers, ) + sitemap_images = None + if len(self.images) > 0: + sitemap_images = [ + SitemapImage( + loc=image.loc, + caption=image.caption, + geo_location=image.geo_location, + title=image.title, + license_=image.license, + ) + for image in self.images + ] + return SitemapPage( url=url, last_modified=last_modified, change_frequency=change_frequency, priority=priority, news_story=sitemap_news_story, + images=sitemap_images, ) - __slots__ = ["_current_page", "_pages", "_page_urls"] + __slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"] def __init__(self, url: str): super().__init__(url=url) @@ -739,6 +778,7 @@ def __init__(self, url: str): self._current_page = None self._pages = [] self._page_urls = set() + self._current_image = None def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: super().xml_element_start(name=name, attrs=attrs) @@ -749,6 +789,16 @@ def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: "Page is expected to be unset by ." ) self._current_page = self.Page() + elif name == "image:image": + if self._current_image: + raise SitemapXMLParsingException( + "Image is expected to be unset by ." + ) + if not self._current_page: + raise SitemapXMLParsingException( + "Page is expected to be set before ." 
+ ) + self._current_image = self.Image() def __require_last_char_data_to_be_set(self, name: str) -> None: if not self._last_char_data: @@ -767,7 +817,9 @@ def xml_element_end(self, name: str) -> None: self._pages.append(self._current_page) self._page_urls.add(self._current_page.url) self._current_page = None - + elif name == "image:image": + self._current_page.images.append(self._current_image) + self._current_image = None else: if name == "sitemap:loc": # Every entry must have @@ -815,6 +867,23 @@ def xml_element_end(self, name: str) -> None: # Element might be present but character data might be empty self._current_page.news_stock_tickers = self._last_char_data + elif name == "image:loc": + # Every image entry must have + self.__require_last_char_data_to_be_set(name=name) + self._current_image.loc = self._last_char_data + + elif name == "image:caption": + self._current_image.caption = self._last_char_data + + elif name == "image:geo_location": + self._current_image.geo_location = self._last_char_data + + elif name == "image:title": + self._current_image.title = self._last_char_data + + elif name == "image:license": + self._current_image.license = self._last_char_data + super().xml_element_end(name=name) def sitemap(self) -> AbstractSitemap: diff --git a/usp/objects/page.py b/usp/objects/page.py index 4fb0f67..191466d 100644 --- a/usp/objects/page.py +++ b/usp/objects/page.py @@ -122,7 +122,7 @@ def __hash__(self): ) ) - def __repr__(self): + def __repr__(self) -> str: return ( f"{self.__class__.__name__}(" f"title={self.title}, " @@ -215,6 +215,116 @@ def stock_tickers(self) -> List[str]: return self.__stock_tickers +class SitemapImage: + """ + Single image derived from Google Image XML sitemap. + + All properties except ``loc`` are now deprecated in the XML specification, see + https://developers.google.com/search/blog/2022/05/spring-cleaning-sitemap-extensions + + They will continue to be supported here. + """ + + __slots__ = ["__loc", "__caption", "__geo_location", "__title", "__license"] + + def __init__( + self, + loc: str, + caption: Optional[str] = None, + geo_location: Optional[str] = None, + title: Optional[str] = None, + license_: Optional[str] = None, + ): + """Initialise a Google Image. + + :param loc: the URL of the image + :param caption: the caption of the image, optional + :param geo_location: the geographic location of the image, for example "Limerick, Ireland", optional + :param title: the title of the image, optional + :param license_: a URL to the license of the image, optional + """ + self.__loc = loc + self.__caption = caption + self.__geo_location = geo_location + self.__title = title + self.__license = license_ + + def __eq__(self, other) -> bool: + if not isinstance(other, SitemapImage): + raise NotImplementedError + + if self.loc != other.loc: + return False + + if self.caption != other.caption: + return False + + if self.geo_location != other.geo_location: + return False + + if self.title != other.title: + return False + + if self.license != other.license: + return False + + return True + + def to_dict(self): + """Convert to a dictionary representation. 
+ + :return: the image data as a dictionary + """ + return { + "loc": self.loc, + "caption": self.caption, + "geo_location": self.geo_location, + "title": self.title, + "license": self.license, + } + + def __hash__(self): + return hash( + (self.loc, self.caption, self.geo_location, self.title, self.license) + ) + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}(" + f"loc={self.loc}, " + f"caption={self.caption}, " + f"geo_location={self.geo_location}, " + f"title={self.title}, " + f"license={self.license}" + ")" + ) + + @property + def loc(self) -> str: + """Get the URL of the image.""" + return self.__loc + + @property + def caption(self) -> Optional[str]: + """Get the caption of the image.""" + return self.__caption + + @property + def geo_location(self) -> Optional[str]: + """Get the geographic location of the image.""" + return self.__geo_location + + @property + def title(self) -> Optional[str]: + """Get the title of the image.""" + return self.__title + + @property + def license(self) -> Optional[str]: + """Get a URL to the license of the image.""" + return self.__license + + @unique class SitemapPageChangeFrequency(Enum): """Change frequency of a sitemap URL.""" @@ -242,6 +352,7 @@ class SitemapPage: "__last_modified", "__change_frequency", "__news_story", + "__images", ] def __init__( @@ -251,6 +362,7 @@ def __init__( last_modified: Optional[datetime.datetime] = None, change_frequency: Optional[SitemapPageChangeFrequency] = None, news_story: Optional[SitemapNewsStory] = None, + images: Optional[List[SitemapImage]] = None, ): """ Initialize a new sitemap-derived page. @@ -266,6 +378,7 @@ def __init__( self.__last_modified = last_modified self.__change_frequency = change_frequency self.__news_story = news_story + self.__images = images def __eq__(self, other) -> bool: if not isinstance(other, SitemapPage): @@ -286,6 +399,9 @@ def __eq__(self, other) -> bool: if self.news_story != other.news_story: return False + if self.images != other.images: + return False + return True def __hash__(self): @@ -296,14 +412,15 @@ def __hash__(self): ) ) - def __repr__(self): + def __repr__(self) -> str: return ( f"{self.__class__.__name__}(" f"url={self.url}, " f"priority={self.priority}, " f"last_modified={self.last_modified}, " f"change_frequency={self.change_frequency}, " - f"news_story={self.news_story}" + f"news_story={self.news_story}, " + f"images={self.images}" ")" ) @@ -320,49 +437,37 @@ def to_dict(self): if self.change_frequency else None, "news_story": self.news_story.to_dict() if self.news_story else None, + "images": [image.to_dict() for image in self.images] + if self.images + else None, } @property def url(self) -> str: - """ - Return page URL. - - :return: Page URL. - """ + """Get the page URL.""" return self.__url @property def priority(self) -> Decimal: - """ - Return priority of this URL relative to other URLs on your site. - - :return: Priority of this URL relative to other URLs on your site. - """ + """Get the priority of this URL relative to other URLs on the site.""" return self.__priority @property def last_modified(self) -> Optional[datetime.datetime]: - """ - Return date of last modification of the URL. - - :return: Date of last modification of the URL. - """ + """Get the date of last modification of the URL.""" return self.__last_modified @property def change_frequency(self) -> Optional[SitemapPageChangeFrequency]: - """ - Return change frequency of a sitemap URL. - - :return: Change frequency of a sitemap URL. 
- """ + """Get the change frequency of a sitemap URL.""" return self.__change_frequency @property def news_story(self) -> Optional[SitemapNewsStory]: - """ - Return Google News story attached to the URL. - - :return: Google News story attached to the URL. - """ + """Get the Google News story attached to the URL.""" return self.__news_story + + @property + def images(self) -> Optional[List[SitemapImage]]: + """Get the images attached to the URL.""" + return self.__images diff --git a/usp/objects/sitemap.py b/usp/objects/sitemap.py index 97cd038..9cdad94 100644 --- a/usp/objects/sitemap.py +++ b/usp/objects/sitemap.py @@ -246,9 +246,7 @@ def __eq__(self, other) -> bool: return True def __repr__(self): - return ( - f"{self.__class__.__name__}(" f"url={self.url}, " f"pages={self.pages}" ")" - ) + return f"{self.__class__.__name__}(url={self.url}, pages={self.pages})" def __getstate__(self) -> tuple[None, dict]: # Load slots of this class and its parents (mangling if appropriate) From 33962cca5c44f48cd822c5db9fafc552023fa363 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Fri, 6 Sep 2024 15:02:08 +0100 Subject: [PATCH 59/79] Add docs for sitemap image extension --- docs/changelog.rst | 2 ++ docs/guides/_sitemap_examples/atom0.3.xml | 18 ------------ docs/guides/_sitemap_examples/atom1.0.xml | 18 ------------ docs/guides/_sitemap_examples/class-tree.dot | 4 --- docs/guides/_sitemap_examples/google-news.xml | 29 ------------------- docs/guides/_sitemap_examples/plaintext.txt | 4 --- docs/guides/_sitemap_examples/rss2.0.xml | 18 ------------ .../guides/_sitemap_examples/simple-index.xml | 11 ------- .../_sitemap_examples/simple-urlset.xml | 11 ------- .../{_sitemap_examples => }/bbc-sitemap.dot | 0 docs/guides/sitemap-tree.rst | 2 +- docs/reference/formats.rst | 20 +++++++++++-- .../formats_examples/google-image.xml | 23 +++++++++++++++ usp/fetch_parse.py | 1 + 14 files changed, 45 insertions(+), 116 deletions(-) delete mode 100644 docs/guides/_sitemap_examples/atom0.3.xml delete mode 100644 docs/guides/_sitemap_examples/atom1.0.xml delete mode 100644 docs/guides/_sitemap_examples/class-tree.dot delete mode 100644 docs/guides/_sitemap_examples/google-news.xml delete mode 100644 docs/guides/_sitemap_examples/plaintext.txt delete mode 100644 docs/guides/_sitemap_examples/rss2.0.xml delete mode 100644 docs/guides/_sitemap_examples/simple-index.xml delete mode 100644 docs/guides/_sitemap_examples/simple-urlset.xml rename docs/guides/{_sitemap_examples => }/bbc-sitemap.dot (100%) create mode 100644 docs/reference/formats_examples/google-image.xml diff --git a/docs/changelog.rst b/docs/changelog.rst index 7f6b82e..ee517d0 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -14,6 +14,7 @@ v1.0.0 (upcoming) - Sitemaps and pages now implement ``to_dict()`` method to convert to dictionaries - Added optional arguments to ``usp.tree.sitemap_tree_for_homepage()`` to disable robots.txt-based or known-path-based sitemap discovery. Default behaviour is still to use both. - Parse sitemaps from a string with :ref:`local parse` +- Support for the Google Image sitemap extension **Performance** @@ -25,6 +26,7 @@ Improvement of parse performance by approximately 90%: **Bug Fixes** - Invalid datetimes will be parsed as ``None`` instead of crashing (reported in :issue:`22`, :issue:`31`) +- Invalid priorities will be set to the default (0.5) instead of crashing - Moved ``__version__`` attribute into main class module - Robots.txt index sitemaps now count for the max recursion depth (reported in :issue:`29`). 
The default maximum has been increased by 1 to compensate for this. diff --git a/docs/guides/_sitemap_examples/atom0.3.xml b/docs/guides/_sitemap_examples/atom0.3.xml deleted file mode 100644 index 967cf41..0000000 --- a/docs/guides/_sitemap_examples/atom0.3.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - Example - - 2024-01-01 - - Page 1 - - https://example.org/page1 - 2024-01-01 - - - Page 2 - - https://example.org/page2 - 2024-01-02 - - \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/atom1.0.xml b/docs/guides/_sitemap_examples/atom1.0.xml deleted file mode 100644 index 4f35803..0000000 --- a/docs/guides/_sitemap_examples/atom1.0.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - Example - - 2024-01-01 - - Page 1 - - https://example.org/page1 - 2024-01-01 - - - Page 2 - - https://example.org/page2 - 2024-01-02 - - \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/class-tree.dot b/docs/guides/_sitemap_examples/class-tree.dot deleted file mode 100644 index 2fd9c48..0000000 --- a/docs/guides/_sitemap_examples/class-tree.dot +++ /dev/null @@ -1,4 +0,0 @@ -digraph G { - root [label="IndexWebsiteSitemap"] - robots [label="IndexRobots -} \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/google-news.xml b/docs/guides/_sitemap_examples/google-news.xml deleted file mode 100644 index baad2df..0000000 --- a/docs/guides/_sitemap_examples/google-news.xml +++ /dev/null @@ -1,29 +0,0 @@ - - - - - https://example.org/news/one - - - - Example.org News - - 2024-01-01 - News Article One - - - - - https://example.org/news/two - - - - Example.org News - - 2024-01-02 - News Article Two - - - \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/plaintext.txt b/docs/guides/_sitemap_examples/plaintext.txt deleted file mode 100644 index 3e2c98c..0000000 --- a/docs/guides/_sitemap_examples/plaintext.txt +++ /dev/null @@ -1,4 +0,0 @@ -Yes, there are plain text sitemaps and they could just contain random text. 
- -https://example.org/page1 -https://example.org/page2 \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/rss2.0.xml b/docs/guides/_sitemap_examples/rss2.0.xml deleted file mode 100644 index 21d702a..0000000 --- a/docs/guides/_sitemap_examples/rss2.0.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - Example - https://example.org/ - Example - - Page 1 - https://example.org/page1 - 2024-01-01 - - - Page 2 - https://example.org/page2 - 2024-01-02 - - - \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/simple-index.xml b/docs/guides/_sitemap_examples/simple-index.xml deleted file mode 100644 index bb7e101..0000000 --- a/docs/guides/_sitemap_examples/simple-index.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - https://example.org/sitemap1.xml - 2024-01-01 - - - https://example.org/sitemap2.xml - 2024-01-02 - - \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/simple-urlset.xml b/docs/guides/_sitemap_examples/simple-urlset.xml deleted file mode 100644 index dd96ad5..0000000 --- a/docs/guides/_sitemap_examples/simple-urlset.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - https://example.org/page1 - 2024-01-01 - - - https://example.org/page2 - 2024-01-02 - - \ No newline at end of file diff --git a/docs/guides/_sitemap_examples/bbc-sitemap.dot b/docs/guides/bbc-sitemap.dot similarity index 100% rename from docs/guides/_sitemap_examples/bbc-sitemap.dot rename to docs/guides/bbc-sitemap.dot diff --git a/docs/guides/sitemap-tree.rst b/docs/guides/sitemap-tree.rst index 9566cbc..f197b53 100644 --- a/docs/guides/sitemap-tree.rst +++ b/docs/guides/sitemap-tree.rst @@ -145,7 +145,7 @@ Large and well-established sites (e.g. media outlets) may have very complex site .. dropdown:: bbc.co.uk Sitemap Graph - .. graphviz:: _sitemap_examples/bbc-sitemap.dot + .. graphviz:: bbc-sitemap.dot Altogether, this sitemap tree contains 2.6 million URLs spread across 75 sitemaps. The ``robots.txt`` file declares 13 sitemaps, some of which are index sitemaps with as many as 50 page sitemaps. Despite this, USP is able to parse this tree in less than a minute and using no more than 90MiB of memory at peak. diff --git a/docs/reference/formats.rst b/docs/reference/formats.rst index 63ee65d..038ed5f 100644 --- a/docs/reference/formats.rst +++ b/docs/reference/formats.rst @@ -106,9 +106,9 @@ XML Sitemap Extensions .. note:: - Only the Google News extension is supported currently. Other extensions (e.g. `Google Image`_ and `Google Video`_) are not currently supported, and only the standard part of the sitemap will be parsed. + The `Google Video`_ extension is not currently supported, and only the standard part of the sitemap will be parsed. + -.. _Google Image: https://developers.google.com/search/docs/crawling-indexing/sitemaps/image-sitemaps .. _Google Video: https://developers.google.com/search/docs/crawling-indexing/sitemaps/video-sitemaps @@ -132,6 +132,22 @@ The Google News extension provides additional information to describe the news s If the page contains Google News data, it is stored as a :class:`~usp.objects.page.SitemapNewsStory` object in :attr:`SitemapPage.news_story `. +Google Image +"""""""""""" + +- `Google documentation `__ + +.. dropdown:: Example + :class-container: flush + + .. literalinclude:: formats_examples/google-image.xml + :emphasize-lines: 3,8-13,19-21 + :language: xml + +The Google Image extension provides additional information to describe images on the page. 
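A short, hedged sketch of consuming the image data added in this patch: SitemapPage.images and the SitemapImage properties (loc, caption, license) come from the diff above, while the homepage URL is a placeholder.

# Illustrative sketch only; the URL is a placeholder.
from usp.tree import sitemap_tree_for_homepage

tree = sitemap_tree_for_homepage("https://example.org/")
for page in tree.all_pages():
    for image in page.images or []:              # images is None when the page has no image data
        print(page.url, image.loc, image.caption, image.license)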
+ +If the page contains Google Image data, it is stored as a list of :class:`~usp.objects.page.SitemapImage` objects in :attr:`SitemapPage.images `. + .. _xml date: Date Time Parsing diff --git a/docs/reference/formats_examples/google-image.xml b/docs/reference/formats_examples/google-image.xml new file mode 100644 index 0000000..45c49aa --- /dev/null +++ b/docs/reference/formats_examples/google-image.xml @@ -0,0 +1,23 @@ + + + + + https://example.org/news/one + + + https://example.com/image.jpg + + + https://example.com/photo.jpg + + + + + https://example.org/news/two + + + https://example.com/image2.jpg + + + \ No newline at end of file diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index 2e4d8fa..ae995e4 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -607,6 +607,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser): """ class Image: + """Data class for holding image data while parsing.""" __slots__ = ["loc", "caption", "geo_location", "title", "license"] def __init__(self): From 3a96d59634fa07c2b700eb4056eb523a193ff3b9 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Mon, 16 Dec 2024 11:18:09 +0000 Subject: [PATCH 60/79] minor formatting --- docs/conf.py | 8 +- poetry.lock | 348 ++++++++++++++++++++++---------------------- usp/objects/page.py | 44 ++---- usp/tree.py | 1 - 4 files changed, 190 insertions(+), 211 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index c46cff7..2f623d8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -7,6 +7,8 @@ sys.path.append(os.path.abspath('extensions')) +from usp import __version__ as usp_version + # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information @@ -16,7 +18,7 @@ project = 'Ultimate Sitemap Parser' copyright = '2018-2024, Ultimate Sitemap Parser Contributors' author = 'Ultimate Sitemap Parser Contributors' -release = '0.5.0' +release = usp_version # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration @@ -31,8 +33,7 @@ 'sphinx_design', 'sphinxext.opengraph', 'sphinx_copybutton', - 'custom_graphviz', - + 'custom_graphviz' ] templates_path = ['_templates'] @@ -58,6 +59,7 @@ 'source_branch': 'master', 'source_directory': 'docs/' } +html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "https://usp-dummy.gate.ac.uk/") # -- Extension Config -------------------------------------------------------- diff --git a/poetry.lock b/poetry.lock index b920318..2a4e863 100644 --- a/poetry.lock +++ b/poetry.lock @@ -650,72 +650,72 @@ windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pyinstrument" -version = "4.7.2" +version = "4.7.3" description = "Call stack profiler for Python. Shows you why your code is slow!" 
optional = false python-versions = ">=3.8" files = [ - {file = "pyinstrument-4.7.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a316a929a29e4fb1c0a122c503e9442580daf485be20bd713fcc60b98bb48509"}, - {file = "pyinstrument-4.7.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:50c56106e4b3a92dbf1c9d36b307cf67c5b667ae35195d41cf1ded7afc26a01a"}, - {file = "pyinstrument-4.7.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:528b6c8267ebe114d04c8e189f80907b6af9e7a7d6a6597f2833ddcfedbde66f"}, - {file = "pyinstrument-4.7.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9f856e7edd39f73d7a68180f03133fc7c6331d3849b8db4d480028c36433ab46"}, - {file = "pyinstrument-4.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6f28831c8386bf820d014282c2e8748049819f61eacb210029fd7e08f45df37"}, - {file = "pyinstrument-4.7.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:78735eb3822746fd12f37ab9a84df35b613b9824b0f8819529c41d9aa09c26c6"}, - {file = "pyinstrument-4.7.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:03dfecfcb7d699b7d8f9d36fb6a11c476233a71eeea78b466c69bca300029603"}, - {file = "pyinstrument-4.7.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b9bd25ba7ef070f538c5e3c6b4a991ce6837a6a2c49c4feba10cb8f5f60182f4"}, - {file = "pyinstrument-4.7.2-cp310-cp310-win32.whl", hash = "sha256:fee18be41331fe0a016c315ea36da4ce965d1fdba051edad16823771e4a0c03d"}, - {file = "pyinstrument-4.7.2-cp310-cp310-win_amd64.whl", hash = "sha256:1a73eb6c07b8c52b976b8a0029dc3dfee83c487f640e97c4b84fcf15cda91caa"}, - {file = "pyinstrument-4.7.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:19c51585e93482cdef7d627f8210f6272d357bf298b6ebd9761bdc2cf50f1b30"}, - {file = "pyinstrument-4.7.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:201eb2460f815efda749a659bf4315d27e964a522c83e04173a052ce89de06d4"}, - {file = "pyinstrument-4.7.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:518f7fbb0f05377391b72e72e8d6942d6413a0d36df0e77a4625b6cbd4ce84fc"}, - {file = "pyinstrument-4.7.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dc1ae87dc6ba8e7fad7ef70996a94a9fd63d5c5c8daa86eb9bc3b2e87f6733a"}, - {file = "pyinstrument-4.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a340ef24718228c57f49750dcac68db1f7d1c9c4d3ce004d3c154f464bacb3d1"}, - {file = "pyinstrument-4.7.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:85e441fcb06d087ae836551dee6a9a9bacf12b0a0c9a6e956376e7c779190474"}, - {file = "pyinstrument-4.7.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fa1f4c0fd2cb118fea3e6d8ba5fcaa9b51c92344841935a7c2c4a8964647273e"}, - {file = "pyinstrument-4.7.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1c8a500c7d077bba643fb3c12fc810f7e1f15fbf37d418cb751f1ee98e275ce6"}, - {file = "pyinstrument-4.7.2-cp311-cp311-win32.whl", hash = "sha256:aa8818f465ed4a6fbe6a2dd59589cc8087fd7ea5faebc32b45c1cb3eb27cfd36"}, - {file = "pyinstrument-4.7.2-cp311-cp311-win_amd64.whl", hash = "sha256:ef64820320ab78f0ce0992104cb7d343ffbb199c015f163fbdc2c66cb3215347"}, - {file = "pyinstrument-4.7.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:10e39476dad9751f2e88a77e50eb5466d16701d9b4efc507a3addce24d1ef43e"}, - {file = "pyinstrument-4.7.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7077831b06d9fec49a92100c8dfd237e1a4c363183746d5a9d44c0174c587547"}, - {file = 
"pyinstrument-4.7.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c2100cf016ee71be21d209d3003ce0dfdac8d74e5e45b9f9ae0a3cfceef7360a"}, - {file = "pyinstrument-4.7.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b00caeff2a7971752a428f9690a337a97ebbdbf14c0f05280b0a4176efd321c"}, - {file = "pyinstrument-4.7.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35dad76e54f0b94f4407579740d91d413ddbc471b465da3782ffa85a87180cbd"}, - {file = "pyinstrument-4.7.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6e6c95ff1e05661457d3f53985a23579cec9fd23639af271fd238ddd545562d4"}, - {file = "pyinstrument-4.7.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:685e998538ba2145fbfe4428534f1cabb5b5719cd5454fbc88c3ab043f2267cb"}, - {file = "pyinstrument-4.7.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0f43db19d1bb923b8b4b50f1d95994151cb04e848acd4740238e3805e87825c3"}, - {file = "pyinstrument-4.7.2-cp312-cp312-win32.whl", hash = "sha256:ef63b4157bf245a2b9543fa71cec71116a4e19c2a6a6ad96623d7b85eaa32119"}, - {file = "pyinstrument-4.7.2-cp312-cp312-win_amd64.whl", hash = "sha256:140203d90e89a06dad86b07cb8d9ab1d763ddc1332502839daac19ff6360ae84"}, - {file = "pyinstrument-4.7.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2df465b065435152473b7c4d0b80c05d3136769251fd7fe725cfcb6eb87340fa"}, - {file = "pyinstrument-4.7.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:50023b396289a27ea5d2f60d78bdeec7e4ccc6051038dfd7f5638c15a314a5d5"}, - {file = "pyinstrument-4.7.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:065451fed990ad050b0fdb4a2bd5f28426f5c5f4b94bd8dab9d144079e073761"}, - {file = "pyinstrument-4.7.2-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:017788c61627f74c3ea503198628bccc46a87e421a282dfb055ff4500026748f"}, - {file = "pyinstrument-4.7.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8df61a879c7316f31791018c92f8cca92cd4dc5a624e629c3d969d77a3657fb"}, - {file = "pyinstrument-4.7.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:656910a5fbb7b99232f8f835815cdf69734b229434c26380c29a0ef09ec9874d"}, - {file = "pyinstrument-4.7.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c2337616952ec3bd35dedb9a1ed396a3accfc0305bc54e22179e77fe63d50909"}, - {file = "pyinstrument-4.7.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ea4e4e7a8ea9a042fa2c4e0efc00d87b29e0af4a1a0b3dba907c3c63cdde4510"}, - {file = "pyinstrument-4.7.2-cp313-cp313-win32.whl", hash = "sha256:24012bc0e5a507189f5f1caa01b4589bb286348e929df6a898c926ffd6e5238a"}, - {file = "pyinstrument-4.7.2-cp313-cp313-win_amd64.whl", hash = "sha256:3d8eaf57bc447b8e108b5d684b371c64232d9895b06a097d8dc2b92f3fdde561"}, - {file = "pyinstrument-4.7.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3cfa57f2a94a52fb3a3e66e910f753b6fd954e20c12407b8e80cc8e50733f771"}, - {file = "pyinstrument-4.7.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4e9a5344b9e8a2748ba610502e7fa951d494591f8e5d8337100108f94bd73e30"}, - {file = "pyinstrument-4.7.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea9af525ce70e9d391b321015e3ef24cccf4df8c51c692492cade49e440b17c2"}, - {file = "pyinstrument-4.7.2-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b05d17721f99e7356e540a3be84bcad2c4f74144fe3a52d74a7da149f44d03d"}, - {file = "pyinstrument-4.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:08581cb58877716d1839950ff0d474516ae743c575dff051babfb066e9c38405"}, - {file = "pyinstrument-4.7.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ad5b688488cab71b601e0aaefd726029f6ddc05525995424387fa88c6f1ce365"}, - {file = "pyinstrument-4.7.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5704125a8b8a0c0d98716d207e1882dfd90fe6c37bf6ac0055b671e43bb13b27"}, - {file = "pyinstrument-4.7.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d704ec91a774066c4a1d1f20046a00e1ef80f50ba9d024919e62365d84b55bdd"}, - {file = "pyinstrument-4.7.2-cp38-cp38-win32.whl", hash = "sha256:6969676c30ce6e078d453a232b074476e32506c5b30a44fc7847cbfe1cb8674f"}, - {file = "pyinstrument-4.7.2-cp38-cp38-win_amd64.whl", hash = "sha256:b6504d60875443bee1f8c31517832b6c054ac0389b745a897484ea1e7edeec5c"}, - {file = "pyinstrument-4.7.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a483be96c025e0287125aad85be3a0bee8687f069e422fb29eab49dd3d53a53d"}, - {file = "pyinstrument-4.7.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ac0caa72765e8f068ad92e9c24c45cf0f4e31c902f403e264199a5667a2e034"}, - {file = "pyinstrument-4.7.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8279d811e86afab5bc31e4aa4f3310b8c5b83682d52cfabee990a9f6a67cd551"}, - {file = "pyinstrument-4.7.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8d5b24a14d0fc74e6d9e471088936593cd9f55bb1bfd502e7801913e9d14308e"}, - {file = "pyinstrument-4.7.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbf90a6b86313ca01b85909e93fb5aaa7a26422a0c6347a07e249b381e77219e"}, - {file = "pyinstrument-4.7.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:e9a96dcbdb272a389fbecb28a5916fab09d2d1a515c997e7bed08c68d5835fbe"}, - {file = "pyinstrument-4.7.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:029855d9bd6bdf66b1948d697261446f049af0b576f0f4b9c2bb5a741a15fefc"}, - {file = "pyinstrument-4.7.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b0331ff6984642a0f66be9e4a66331f1a401948b8bf89ed60990f229fbd10432"}, - {file = "pyinstrument-4.7.2-cp39-cp39-win32.whl", hash = "sha256:4db19ffbb0047e00c6d444ac0e648505982399361aa609b3af9229a971dca79e"}, - {file = "pyinstrument-4.7.2-cp39-cp39-win_amd64.whl", hash = "sha256:b174abcc7438f8aa20a190fcafd8eba099af54af445ce5ea1b28b25750f59652"}, - {file = "pyinstrument-4.7.2.tar.gz", hash = "sha256:8c4e4792e7bc2de6ad757dcb05bb6739b5aed64f834602e8121f611e3278e0d1"}, + {file = "pyinstrument-4.7.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6a79912f8a096ccad1b88a527719563f6b2b5dc94057873c2ca840dc6378cfee"}, + {file = "pyinstrument-4.7.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:089f7afb326ee937656ee1767813dc793ad20b3d353d081e16255b63830a4787"}, + {file = "pyinstrument-4.7.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f65107079f68dcaeb58ee032d98075ab7ac49be419c60673406043e0675393b4"}, + {file = "pyinstrument-4.7.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9402e339d802a7f5b1ad716b8411ab98f45e51c4b261e662b8a470c251af0acc"}, + {file = "pyinstrument-4.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d1f4e0155f563f66e821210c225af8b64a2283c0feff776c49feba623e7bafd"}, + {file = "pyinstrument-4.7.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c619f3064dae5284b904c4862b35639c35ecd439bb5b4152924f7ccb69edc5e3"}, + {file = "pyinstrument-4.7.3-cp310-cp310-musllinux_1_2_i686.whl", hash = 
"sha256:9b4d80deaf76cc171b3b707e2babc9a7046610c4e11022167949e60fc2dc62be"}, + {file = "pyinstrument-4.7.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c5fbe9d24154a118a4b86bed5ae228c3d8698216fad65257aca97e790527197a"}, + {file = "pyinstrument-4.7.3-cp310-cp310-win32.whl", hash = "sha256:7405aec2227ed87dc3bc3a8eb82b5dcdec68861d564ee0d429f9a51ca30ccd58"}, + {file = "pyinstrument-4.7.3-cp310-cp310-win_amd64.whl", hash = "sha256:8043b9c1fb0c19a2957098930c3bad43ecdc1cf8e1d3f32a3b9ef74fdd3df028"}, + {file = "pyinstrument-4.7.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:77594adf4713bc3e430e300561a2d837213cf9015414c0e0de6aef0cb9cebd80"}, + {file = "pyinstrument-4.7.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:70afa765c06e4f7605033b85ef82ed946ec8e6ae1835e25f6cbb01205a624197"}, + {file = "pyinstrument-4.7.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b1321514863be18138a6d761696b3f6e8645390dd2f6c8a6d66a453f0d5187c"}, + {file = "pyinstrument-4.7.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de40b44ff2fe78493b944b679cc084e72b2648c37a96fcfbccb9171a4449e509"}, + {file = "pyinstrument-4.7.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a7c481daec4bd77a3dbfbe01a0155e03352dd700f3c3efe4bdbc30821b20e19"}, + {file = "pyinstrument-4.7.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ae2c966c91da630a23dbff5f7e61ad2eee133cfaf1e4acf7e09fcf506cbb6251"}, + {file = "pyinstrument-4.7.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fa2715e3ac3ce2f4b9c4e468a9a4faf43ca645beea002cb47533902576f4f64d"}, + {file = "pyinstrument-4.7.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:61db15f8b59a3a1964041a8df260667fb5dabddd928301e3580cf93d7a05e352"}, + {file = "pyinstrument-4.7.3-cp311-cp311-win32.whl", hash = "sha256:4766bbb2b451460432c97baf00bbda56653429671e8daec344d343f21fb05b8f"}, + {file = "pyinstrument-4.7.3-cp311-cp311-win_amd64.whl", hash = "sha256:b2d2a0e401db6800f63de0539415cdff46b138914d771a46db0b3f673f9827e7"}, + {file = "pyinstrument-4.7.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:7c29f7a23e0f704f5f21aeeb47193460601e7359d09156ea043395870494b39a"}, + {file = "pyinstrument-4.7.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84ceb25f24ceb03dc770b6c142ec4419506d3a04d66d778810cb8da76df25651"}, + {file = "pyinstrument-4.7.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d564d6f6151d3cab28430092cdcbd4aefe0834551af4b4f97e6e57025a348557"}, + {file = "pyinstrument-4.7.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e23ce5fcc30346e576b98ca24bd2a9a68cbc42b90cdb0d8f376fa82cee2fe23"}, + {file = "pyinstrument-4.7.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23d5ad174d2a488c164abee4407f3f3a6e6d5721ab1fab9e0ad9570631704c2"}, + {file = "pyinstrument-4.7.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d87749f68b9cc221628aab989a4a73b16030c27c714ecd83892d716f863d9739"}, + {file = "pyinstrument-4.7.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:897d09c876f18b713498be21430b39428a9254ffec0c6c06796fce0e6a8fe437"}, + {file = "pyinstrument-4.7.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2092910e745cfd0a62dadf041afb38239195244871ee127b1028e7e790602e6b"}, + {file = "pyinstrument-4.7.3-cp312-cp312-win32.whl", hash = "sha256:e9824e11290f6f2772c257cc0bd07f59405759287db6ebcbb06f962a3eba68fb"}, + {file = "pyinstrument-4.7.3-cp312-cp312-win_amd64.whl", hash = 
"sha256:cf1e67b37e936f647ce731fff5d2f54e102813274d350671dc5961ec8b46b3ff"}, + {file = "pyinstrument-4.7.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6de792dc65dcc75e73b721f4e89aa60a4d2f8617e5a5da060244058018ad0399"}, + {file = "pyinstrument-4.7.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:73da379506a09cdff2fdd23a0b3eb8f020f473d019f604538e0e5045613e33d4"}, + {file = "pyinstrument-4.7.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21e05f53810a6ff5fa261da838935fd1b2ab2bf30a7c053f6c72bcaaa6de0933"}, + {file = "pyinstrument-4.7.3-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d648596ea04409ca3ca260029041ed7fa046b776205bf9a0b75cda0a4f4d2515"}, + {file = "pyinstrument-4.7.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d98997347047a217ef6b844273d3753e543e0984f2220e9dd284cbef6054c2a"}, + {file = "pyinstrument-4.7.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7f09ebad95af94f5427c20005fc7ba84a0a3deae6324434d7ec3be99d369bf37"}, + {file = "pyinstrument-4.7.3-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8a66aee3d2cf0cc6b8e57cb189fd9fb16d13b8d538419999596ce4f58b5d4a9a"}, + {file = "pyinstrument-4.7.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eaa45270af0b9d86f1cef705520e9b43f4a1cd18397083f8a594a28f898d078b"}, + {file = "pyinstrument-4.7.3-cp313-cp313-win32.whl", hash = "sha256:6e85b34a9b8ed4df4deaa0afe63bc765ea29003eb5b9b3bc0323f7ad7f7cd0fd"}, + {file = "pyinstrument-4.7.3-cp313-cp313-win_amd64.whl", hash = "sha256:6002ea1018d6d6f9b6f1c66b3e14805213573bd69f79b2e7ad2c507441b3e73e"}, + {file = "pyinstrument-4.7.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b68c5b97690604741bb1f028ec75d2a6298500f415590ae92a766f71b82fc72a"}, + {file = "pyinstrument-4.7.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:df9ba133f5a771dd30df1d3b868af75bdb7f12c9ebd5ddd463d09aa6334d96ef"}, + {file = "pyinstrument-4.7.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bfad987207c89b51f80be71f5362cead4ccd62b9f407248b87e91863bba70e4d"}, + {file = "pyinstrument-4.7.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65fd559498902d1560d728238eea53d8dd54cb8f697b816cacce5524f09d8757"}, + {file = "pyinstrument-4.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470a4f6de1a1edf7debe87917b5d12f94fe59975a8a0e91c22ad789b55720073"}, + {file = "pyinstrument-4.7.3-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f29ed5778b83bf40bd808f120cd2ea11ef94acd2aa5b64398e6d56958b88ab26"}, + {file = "pyinstrument-4.7.3-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:6d642d8c69091fd49286136b7d958f8dbac969a3f6259c7c6d78e8ff207d235e"}, + {file = "pyinstrument-4.7.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:346bc584c542c4c77ca46e8f55eb2d3265ee992839e06d535a22ca65c5b9e767"}, + {file = "pyinstrument-4.7.3-cp38-cp38-win32.whl", hash = "sha256:66af331f9da06df36afbdbd2b7128ae725bb444f24584d2ed1f4c67d1b2759b8"}, + {file = "pyinstrument-4.7.3-cp38-cp38-win_amd64.whl", hash = "sha256:57992c5f73fad7b560e27f864ff9824c6ccc834d48bbeaf4cecf66193cfe28c6"}, + {file = "pyinstrument-4.7.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8b944c939c49af88cec1e20e9c28eec80c478fc2fd53b23ed58702bcb5bcbcf9"}, + {file = "pyinstrument-4.7.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:edd85ee9c6aa5be0bf78d48ad2eb5e02fdab1a646875d90fa09cbc61f4c91a01"}, + {file = 
"pyinstrument-4.7.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e381fc56ba4a77cb45d82eb69689d900a5ee7205a5eb90131234b21ae7a1991"}, + {file = "pyinstrument-4.7.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:98e1b7695c234786e82500394ef50f205713f8702a31aec84fdd0687e0ab8405"}, + {file = "pyinstrument-4.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03dd0c51f6ca706be5c27715e9b4527aa82003c2705d3173943c5b4a2b7a47e8"}, + {file = "pyinstrument-4.7.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2b312442f01fbf2582cd7c929703608cb82874b73a0f3250cbeffc4abddae4f5"}, + {file = "pyinstrument-4.7.3-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e660d9a7f57909574010056dbc80869866623669455516ffc7421988286ddaf3"}, + {file = "pyinstrument-4.7.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:886ccb349aefcbd5be1f33247b3a1af4ad5d34939338d99e94bae064886bf0d8"}, + {file = "pyinstrument-4.7.3-cp39-cp39-win32.whl", hash = "sha256:1ce2828cc29b17720f3c66345ea6f9ff54a3860d0488b59c985377ce2e6a710b"}, + {file = "pyinstrument-4.7.3-cp39-cp39-win_amd64.whl", hash = "sha256:e562e608f878540d19a514774e0f24fccaeac035674cf2b2afacdae9e0e19b29"}, + {file = "pyinstrument-4.7.3.tar.gz", hash = "sha256:3ad61041ff1880d4c99d3384cd267e38a0a6472b5a4dd765992db376bd4394c8"}, ] [package.extras] @@ -902,29 +902,29 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "ruff" -version = "0.6.3" +version = "0.6.4" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.6.3-py3-none-linux_armv6l.whl", hash = "sha256:97f58fda4e309382ad30ede7f30e2791d70dd29ea17f41970119f55bdb7a45c3"}, - {file = "ruff-0.6.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:3b061e49b5cf3a297b4d1c27ac5587954ccb4ff601160d3d6b2f70b1622194dc"}, - {file = "ruff-0.6.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:34e2824a13bb8c668c71c1760a6ac7d795ccbd8d38ff4a0d8471fdb15de910b1"}, - {file = "ruff-0.6.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bddfbb8d63c460f4b4128b6a506e7052bad4d6f3ff607ebbb41b0aa19c2770d1"}, - {file = "ruff-0.6.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ced3eeb44df75353e08ab3b6a9e113b5f3f996bea48d4f7c027bc528ba87b672"}, - {file = "ruff-0.6.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47021dff5445d549be954eb275156dfd7c37222acc1e8014311badcb9b4ec8c1"}, - {file = "ruff-0.6.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:7d7bd20dc07cebd68cc8bc7b3f5ada6d637f42d947c85264f94b0d1cd9d87384"}, - {file = "ruff-0.6.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:500f166d03fc6d0e61c8e40a3ff853fa8a43d938f5d14c183c612df1b0d6c58a"}, - {file = "ruff-0.6.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42844ff678f9b976366b262fa2d1d1a3fe76f6e145bd92c84e27d172e3c34500"}, - {file = "ruff-0.6.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70452a10eb2d66549de8e75f89ae82462159855e983ddff91bc0bce6511d0470"}, - {file = "ruff-0.6.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:65a533235ed55f767d1fc62193a21cbf9e3329cf26d427b800fdeacfb77d296f"}, - {file = "ruff-0.6.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d2e2c23cef30dc3cbe9cc5d04f2899e7f5e478c40d2e0a633513ad081f7361b5"}, - {file = "ruff-0.6.3-py3-none-musllinux_1_2_i686.whl", hash = 
"sha256:d8a136aa7d228975a6aee3dd8bea9b28e2b43e9444aa678fb62aeb1956ff2351"}, - {file = "ruff-0.6.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:f92fe93bc72e262b7b3f2bba9879897e2d58a989b4714ba6a5a7273e842ad2f8"}, - {file = "ruff-0.6.3-py3-none-win32.whl", hash = "sha256:7a62d3b5b0d7f9143d94893f8ba43aa5a5c51a0ffc4a401aa97a81ed76930521"}, - {file = "ruff-0.6.3-py3-none-win_amd64.whl", hash = "sha256:746af39356fee2b89aada06c7376e1aa274a23493d7016059c3a72e3b296befb"}, - {file = "ruff-0.6.3-py3-none-win_arm64.whl", hash = "sha256:14a9528a8b70ccc7a847637c29e56fd1f9183a9db743bbc5b8e0c4ad60592a82"}, - {file = "ruff-0.6.3.tar.gz", hash = "sha256:183b99e9edd1ef63be34a3b51fee0a9f4ab95add123dbf89a71f7b1f0c991983"}, + {file = "ruff-0.6.4-py3-none-linux_armv6l.whl", hash = "sha256:c4b153fc152af51855458e79e835fb6b933032921756cec9af7d0ba2aa01a258"}, + {file = "ruff-0.6.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:bedff9e4f004dad5f7f76a9d39c4ca98af526c9b1695068198b3bda8c085ef60"}, + {file = "ruff-0.6.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d02a4127a86de23002e694d7ff19f905c51e338c72d8e09b56bfb60e1681724f"}, + {file = "ruff-0.6.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7862f42fc1a4aca1ea3ffe8a11f67819d183a5693b228f0bb3a531f5e40336fc"}, + {file = "ruff-0.6.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eebe4ff1967c838a1a9618a5a59a3b0a00406f8d7eefee97c70411fefc353617"}, + {file = "ruff-0.6.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:932063a03bac394866683e15710c25b8690ccdca1cf192b9a98260332ca93408"}, + {file = "ruff-0.6.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:50e30b437cebef547bd5c3edf9ce81343e5dd7c737cb36ccb4fe83573f3d392e"}, + {file = "ruff-0.6.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c44536df7b93a587de690e124b89bd47306fddd59398a0fb12afd6133c7b3818"}, + {file = "ruff-0.6.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ea086601b22dc5e7693a78f3fcfc460cceabfdf3bdc36dc898792aba48fbad6"}, + {file = "ruff-0.6.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b52387d3289ccd227b62102c24714ed75fbba0b16ecc69a923a37e3b5e0aaaa"}, + {file = "ruff-0.6.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:0308610470fcc82969082fc83c76c0d362f562e2f0cdab0586516f03a4e06ec6"}, + {file = "ruff-0.6.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:803b96dea21795a6c9d5bfa9e96127cc9c31a1987802ca68f35e5c95aed3fc0d"}, + {file = "ruff-0.6.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:66dbfea86b663baab8fcae56c59f190caba9398df1488164e2df53e216248baa"}, + {file = "ruff-0.6.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:34d5efad480193c046c86608dbba2bccdc1c5fd11950fb271f8086e0c763a5d1"}, + {file = "ruff-0.6.4-py3-none-win32.whl", hash = "sha256:f0f8968feea5ce3777c0d8365653d5e91c40c31a81d95824ba61d871a11b8523"}, + {file = "ruff-0.6.4-py3-none-win_amd64.whl", hash = "sha256:549daccee5227282289390b0222d0fbee0275d1db6d514550d65420053021a58"}, + {file = "ruff-0.6.4-py3-none-win_arm64.whl", hash = "sha256:ac4b75e898ed189b3708c9ab3fc70b79a433219e1e87193b4f2b77251d058d14"}, + {file = "ruff-0.6.4.tar.gz", hash = "sha256:ac3b5bfbee99973f80aa1b7cbd1c9cbce200883bdd067300c22a6cc1c7fba212"}, ] [[package]] @@ -1611,103 +1611,103 @@ files = [ [[package]] name = "yarl" -version = "1.9.7" +version = "1.9.11" description = "Yet another URL library" optional = false python-versions = ">=3.8" files = [ - {file = 
"yarl-1.9.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:60c04415b31a1611ef5989a6084dd6f6b95652c6a18378b58985667b65b2ecb6"}, - {file = "yarl-1.9.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1787dcfdbe730207acb454548a6e19f80ae75e6d2d1f531c5a777bc1ab6f7952"}, - {file = "yarl-1.9.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f5ddad20363f9f1bbedc95789c897da62f939e6bc855793c3060ef8b9f9407bf"}, - {file = "yarl-1.9.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdb156a06208fc9645ae7cc0fca45c40dd40d7a8c4db626e542525489ca81a9"}, - {file = "yarl-1.9.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:522fa3d300d898402ae4e0fa7c2c21311248ca43827dc362a667de87fdb4f1be"}, - {file = "yarl-1.9.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e7f9cabfb8b980791b97a3ae3eab2e38b2ba5eab1af9b7495bdc44e1ce7c89e3"}, - {file = "yarl-1.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fc728857df4087da6544fc68f62d7017fa68d74201d5b878e18ed4822c31fb3"}, - {file = "yarl-1.9.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dba2ebac677184d56374fa3e452b461f5d6a03aa132745e648ae8859361eb6b"}, - {file = "yarl-1.9.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a95167ae34667c5cc7d9206c024f793e8ffbadfb307d5c059de470345de58a21"}, - {file = "yarl-1.9.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9d319ac113ca47352319cbea92d1925a37cb7bd61a8c2f3e3cd2e96eb33cccae"}, - {file = "yarl-1.9.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:2d71a5d818d82586ac46265ae01466e0bda0638760f18b21f1174e0dd58a9d2f"}, - {file = "yarl-1.9.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ff03f1c1ac474c66d474929ae7e4dd195592c1c7cc8c36418528ed81b1ca0a79"}, - {file = "yarl-1.9.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:78250f635f221dde97d02c57aade3313310469bc291888dfe32acd1012594441"}, - {file = "yarl-1.9.7-cp310-cp310-win32.whl", hash = "sha256:f3aaf9fa960d55bd7876d55d7ea3cc046f3660df1ff73fc1b8c520a741ed1f21"}, - {file = "yarl-1.9.7-cp310-cp310-win_amd64.whl", hash = "sha256:e8362c941e07fbcde851597672a5e41b21dc292b7d5a1dc439b7a93c9a1af5d9"}, - {file = "yarl-1.9.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:596069ddeaf72b5eb36cd714dcd2b5751d0090d05a8d65113b582ed9e1c801fb"}, - {file = "yarl-1.9.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cb870907e8b86b2f32541403da9455afc1e535ce483e579bea0e6e79a0cc751c"}, - {file = "yarl-1.9.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ca5e86be84492fa403c4dcd4dcaf8e1b1c4ffc747b5176f7c3d09878c45719b0"}, - {file = "yarl-1.9.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a99cecfb51c84d00132db909e83ae388793ca86e48df7ae57f1be0beab0dcce5"}, - {file = "yarl-1.9.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:25508739e9b44d251172145f54c084b71747b09e4d237dc2abb045f46c36a66e"}, - {file = "yarl-1.9.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:60f3b5aec3146b6992640592856414870f5b20eb688c1f1d5f7ac010a7f86561"}, - {file = "yarl-1.9.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1557456afce5db3d655b5f8a31cdcaae1f47e57958760525c44b76e812b4987"}, - {file = "yarl-1.9.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:71bb1435a84688ed831220c5305d96161beb65cac4a966374475348aa3de4575"}, 
- {file = "yarl-1.9.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f87d8645a7a806ec8f66aac5e3b1dcb5014849ff53ffe2a1f0b86ca813f534c7"}, - {file = "yarl-1.9.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:58e3f01673873b8573da3abe138debc63e4e68541b2104a55df4c10c129513a4"}, - {file = "yarl-1.9.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8af0bbd4d84f8abdd9b11be9488e32c76b1501889b73c9e2292a15fb925b378b"}, - {file = "yarl-1.9.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:7fc441408ed0d9c6d2d627a02e281c21f5de43eb5209c16636a17fc704f7d0f8"}, - {file = "yarl-1.9.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a9552367dc440870556da47bb289a806f08ad06fbc4054072d193d9e5dd619ba"}, - {file = "yarl-1.9.7-cp311-cp311-win32.whl", hash = "sha256:628619008680a11d07243391271b46f07f13b75deb9fe92ef342305058c70722"}, - {file = "yarl-1.9.7-cp311-cp311-win_amd64.whl", hash = "sha256:bc23d870864971c8455cfba17498ccefa53a5719ea9f5fce5e7e9c1606b5755f"}, - {file = "yarl-1.9.7-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d8cf3d0b67996edc11957aece3fbce4c224d0451c7c3d6154ec3a35d0e55f6b"}, - {file = "yarl-1.9.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3a7748cd66fef49c877e59503e0cc76179caf1158d1080228e67e1db14554f08"}, - {file = "yarl-1.9.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a6fa3aeca8efabb0fbbb3b15e0956b0cb77f7d9db67c107503c30af07cd9e00"}, - {file = "yarl-1.9.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf37dd0008e5ac5c3880198976063c491b6a15b288d150d12833248cf2003acb"}, - {file = "yarl-1.9.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87aa5308482f248f8c3bd9311cd6c7dfd98ea1a8e57e35fb11e4adcac3066003"}, - {file = "yarl-1.9.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:867b13c1b361f9ba5d2f84dc5408082f5d744c83f66de45edc2b96793a9c5e48"}, - {file = "yarl-1.9.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48ce93947554c2c85fe97fc4866646ec90840bc1162e4db349b37d692a811755"}, - {file = "yarl-1.9.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fcd3d94b848cba132f39a5b40d80b0847d001a91a6f35a2204505cdd46afe1b2"}, - {file = "yarl-1.9.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d06d6a8f98dd87646d98f0c468be14b201e47ec6092ad569adf835810ad0dffb"}, - {file = "yarl-1.9.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:91567ff4fce73d2e7ac67ed5983ad26ba2343bc28cb22e1e1184a9677df98d7c"}, - {file = "yarl-1.9.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1d5594512541e63188fea640b7f066c218d2176203d6e6f82abf702ae3dca3b2"}, - {file = "yarl-1.9.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9c2743e43183e4afbb07d5605693299b8756baff0b086c25236c761feb0e3c56"}, - {file = "yarl-1.9.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:daa69a3a2204355af39f4cfe7f3870d87c53d77a597b5100b97e3faa9460428b"}, - {file = "yarl-1.9.7-cp312-cp312-win32.whl", hash = "sha256:36b16884336c15adf79a4bf1d592e0c1ffdb036a760e36a1361565b66785ec6c"}, - {file = "yarl-1.9.7-cp312-cp312-win_amd64.whl", hash = "sha256:2ead2f87a1174963cc406d18ac93d731fbb190633d3995fa052d10cefae69ed8"}, - {file = "yarl-1.9.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:808eddabcb6f7b2cdb6929b3e021ac824a2c07dc7bc83f7618e18438b1b65781"}, - {file = "yarl-1.9.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:395ab0d8ce6d104a988da429bcbfd445e03fb4c911148dfd523f69d13f772e47"}, - {file = 
"yarl-1.9.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:49827dfccbd59c4499605c13805e947349295466e490860a855b7c7e82ec9c75"}, - {file = "yarl-1.9.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6b8bbdd425d0978311520ea99fb6c0e9e04e64aee84fac05f3157ace9f81b05"}, - {file = "yarl-1.9.7-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71d33fd1c219b5b28ee98cd76da0c9398a4ed4792fd75c94135237db05ba5ca8"}, - {file = "yarl-1.9.7-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:62440431741d0b7d410e5cbad800885e3289048140a43390ecab4f0b96dde3bb"}, - {file = "yarl-1.9.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4db97210433366dfba55590e48285b89ad0146c52bf248dd0da492dd9f0f72cf"}, - {file = "yarl-1.9.7-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:653597b615809f2e5f4dba6cd805608b6fd3597128361a22cc612cf7c7a4d1bf"}, - {file = "yarl-1.9.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:df47612129e66f7ce7c9994d4cd4e6852f6e3bf97699375d86991481796eeec8"}, - {file = "yarl-1.9.7-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5e338b6febbae6c9fe86924bac3ea9c1944e33255c249543cd82a4af6df6047b"}, - {file = "yarl-1.9.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e649d37d04665dddb90994bbf0034331b6c14144cc6f3fbce400dc5f28dc05b7"}, - {file = "yarl-1.9.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:0a1b8fd849567be56342e988e72c9d28bd3c77b9296c38b9b42d2fe4813c9d3f"}, - {file = "yarl-1.9.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f9d715b2175dff9a49c6dafdc2ab3f04850ba2f3d4a77f69a5a1786b057a9d45"}, - {file = "yarl-1.9.7-cp313-cp313-win32.whl", hash = "sha256:bc9233638b07c2e4a3a14bef70f53983389bffa9e8cb90a2da3f67ac9c5e1842"}, - {file = "yarl-1.9.7-cp313-cp313-win_amd64.whl", hash = "sha256:62e110772330d7116f91e79cd83fef92545cb2f36414c95881477aa01971f75f"}, - {file = "yarl-1.9.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a564155cc2194ecd9c0d8f8dc57059b822a507de5f08120063675eb9540576aa"}, - {file = "yarl-1.9.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:03e917cc44a01e1be60a83ee1a17550b929490aaa5df2a109adc02137bddf06b"}, - {file = "yarl-1.9.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eefda67ba0ba44ab781e34843c266a76f718772b348f7c5d798d8ea55b95517f"}, - {file = "yarl-1.9.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:316c82b499b6df41444db5dea26ee23ece9356e38cea43a8b2af9e6d8a3558e4"}, - {file = "yarl-1.9.7-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10452727843bc847596b75e30a7fe92d91829f60747301d1bd60363366776b0b"}, - {file = "yarl-1.9.7-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:050f3e4d886be55728fef268587d061c5ce6f79a82baba71840801b63441c301"}, - {file = "yarl-1.9.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0aabe557446aa615693a82b4d3803c102fd0e7a6a503bf93d744d182a510184"}, - {file = "yarl-1.9.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23404842228e6fa8ace235024519df37f3f8e173620407644d40ddca571ff0f4"}, - {file = "yarl-1.9.7-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:34736fcc9d6d7080ebbeb0998ecb91e4f14ad8f18648cf0b3099e2420a225d86"}, - {file = "yarl-1.9.7-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:48f7a158f3ca67509d21cb02a96964e4798b6f133691cc0c86cf36e26e26ec8f"}, - {file = 
"yarl-1.9.7-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:6639444d161c693cdabb073baaed1945c717d3982ecedf23a219bc55a242e728"}, - {file = "yarl-1.9.7-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:1cd450e10cb53d63962757c3f6f7870be49a3e448c46621d6bd46f8088d532de"}, - {file = "yarl-1.9.7-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74d3ef5e81f81507cea04bf5ae22f18ef538607a7c754aac2b6e3029956a2842"}, - {file = "yarl-1.9.7-cp38-cp38-win32.whl", hash = "sha256:4052dbd0c900bece330e3071c636f99dff06e4628461a29b38c6e222a427cf98"}, - {file = "yarl-1.9.7-cp38-cp38-win_amd64.whl", hash = "sha256:dd08da4f2d171e19bd02083c921f1bef89f8f5f87000d0ffc49aa257bc5a9802"}, - {file = "yarl-1.9.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ab906a956d2109c6ea11e24c66592b06336e2743509290117f0f7f47d2c1dd3"}, - {file = "yarl-1.9.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d8ad761493d5aaa7ab2a09736e62b8a220cb0b10ff8ccf6968c861cd8718b915"}, - {file = "yarl-1.9.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d35f9cdab0ec5e20cf6d2bd46456cf599052cf49a1698ef06b9592238d1cf1b1"}, - {file = "yarl-1.9.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a48d2b9f0ae29a456fb766ae461691378ecc6cf159dd9f938507d925607591c3"}, - {file = "yarl-1.9.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cf85599c9336b89b92c313519bcaa223d92fa5d98feb4935a47cce2e8722b4b8"}, - {file = "yarl-1.9.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8e8916b1ff7680b1f2b1608c82dc15c569b9f2cb2da100c747c291f1acf18a14"}, - {file = "yarl-1.9.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29c80890e0a64fb0e5f71350d48da330995073881f8b8e623154aef631febfb0"}, - {file = "yarl-1.9.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9163d21aa40ff8528db2aee2b0b6752efe098055b41ab8e5422b2098457199fe"}, - {file = "yarl-1.9.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:65e3098969baf221bb45e3b2f60735fc2b154fc95902131ebc604bae4c629ea6"}, - {file = "yarl-1.9.7-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:cddebd096effe4be90fd378e4224cd575ac99e1c521598a6900e94959006e02e"}, - {file = "yarl-1.9.7-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:8525f955a2dcc281573b6aadeb8ab9c37e2d3428b64ca6a2feec2a794a69c1da"}, - {file = "yarl-1.9.7-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:5d585c7d834c13f24c7e3e0efaf1a4b7678866940802e11bd6c4d1f99c935e6b"}, - {file = "yarl-1.9.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:78805148e780a9ca66f3123e04741e344b66cf06b4fb13223e3a209f39a6da55"}, - {file = "yarl-1.9.7-cp39-cp39-win32.whl", hash = "sha256:3f53df493ec80b76969d6e1ae6e4411a55ab1360e02b80c84bd4b33d61a567ba"}, - {file = "yarl-1.9.7-cp39-cp39-win_amd64.whl", hash = "sha256:c81c28221a85add23a0922a6aeb2cdda7f9723e03e2dfae06fee5c57fe684262"}, - {file = "yarl-1.9.7-py3-none-any.whl", hash = "sha256:49935cc51d272264358962d050d726c3e5603a616f53e52ea88e9df1728aa2ee"}, - {file = "yarl-1.9.7.tar.gz", hash = "sha256:f28e602edeeec01fc96daf7728e8052bc2e12a672e2a138561a1ebaf30fd9df7"}, + {file = "yarl-1.9.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:79e08c691deae6fcac2fdde2e0515ac561dd3630d7c8adf7b1e786e22f1e193b"}, + {file = "yarl-1.9.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:752f4b5cf93268dc73c2ae994cc6d684b0dad5118bc87fbd965fd5d6dca20f45"}, + {file = "yarl-1.9.11-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:441049d3a449fb8756b0535be72c6a1a532938a33e1cf03523076700a5f87a01"}, + {file = "yarl-1.9.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b3dfe17b4aed832c627319da22a33f27f282bd32633d6b145c726d519c89fbaf"}, + {file = "yarl-1.9.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:67abcb7df27952864440c9c85f1c549a4ad94afe44e2655f77d74b0d25895454"}, + {file = "yarl-1.9.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6de3fa29e76fd1518a80e6af4902c44f3b1b4d7fed28eb06913bba4727443de3"}, + {file = "yarl-1.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fee45b3bd4d8d5786472e056aa1359cc4dc9da68aded95a10cd7929a0ec661fe"}, + {file = "yarl-1.9.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c59b23886234abeba62087fd97d10fb6b905d9e36e2f3465d1886ce5c0ca30df"}, + {file = "yarl-1.9.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d93c612b2024ac25a3dc01341fd98fdd19c8c5e2011f3dcd084b3743cba8d756"}, + {file = "yarl-1.9.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:4d368e3b9ecd50fa22017a20c49e356471af6ae91c4d788c6e9297e25ddf5a62"}, + {file = "yarl-1.9.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5b593acd45cdd4cf6664d342ceacedf25cd95263b83b964fddd6c78930ea5211"}, + {file = "yarl-1.9.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:224f8186c220ff00079e64bf193909829144d4e5174bb58665ef0da8bf6955c4"}, + {file = "yarl-1.9.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:91c478741d7563a12162f7a2db96c0d23d93b0521563f1f1f0ece46ea1702d33"}, + {file = "yarl-1.9.11-cp310-cp310-win32.whl", hash = "sha256:1cdb8f5bb0534986776a43df84031da7ff04ac0cf87cb22ae8a6368231949c40"}, + {file = "yarl-1.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:498439af143b43a2b2314451ffd0295410aa0dcbdac5ee18fc8633da4670b605"}, + {file = "yarl-1.9.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9e290de5db4fd4859b4ed57cddfe793fcb218504e65781854a8ac283ab8d5518"}, + {file = "yarl-1.9.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e5f50a2e26cc2b89186f04c97e0ec0ba107ae41f1262ad16832d46849864f914"}, + {file = "yarl-1.9.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b4a0e724a28d7447e4d549c8f40779f90e20147e94bf949d490402eee09845c6"}, + {file = "yarl-1.9.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85333d38a4fa5997fa2ff6fd169be66626d814b34fa35ec669e8c914ca50a097"}, + {file = "yarl-1.9.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ff184002ee72e4b247240e35d5dce4c2d9a0e81fdbef715dde79ab4718aa541"}, + {file = "yarl-1.9.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:675004040f847c0284827f44a1fa92d8baf425632cc93e7e0aa38408774b07c1"}, + {file = "yarl-1.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b30703a7ade2b53f02e09a30685b70cd54f65ed314a8d9af08670c9a5391af1b"}, + {file = "yarl-1.9.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7230007ab67d43cf19200ec15bc6b654e6b85c402f545a6fc565d254d34ff754"}, + {file = "yarl-1.9.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8c2cf0c7ad745e1c6530fe6521dfb19ca43338239dfcc7da165d0ef2332c0882"}, + {file = "yarl-1.9.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4567cc08f479ad80fb07ed0c9e1bcb363a4f6e3483a490a39d57d1419bf1c4c7"}, + {file = 
"yarl-1.9.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:95adc179a02949c4560ef40f8f650a008380766eb253d74232eb9c024747c111"}, + {file = "yarl-1.9.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:755ae9cff06c429632d750aa8206f08df2e3d422ca67be79567aadbe74ae64cc"}, + {file = "yarl-1.9.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:94f71d54c5faf715e92c8434b4a0b968c4d1043469954d228fc031d51086f143"}, + {file = "yarl-1.9.11-cp311-cp311-win32.whl", hash = "sha256:4ae079573efeaa54e5978ce86b77f4175cd32f42afcaf9bfb8a0677e91f84e4e"}, + {file = "yarl-1.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:9fae7ec5c9a4fe22abb995804e6ce87067dfaf7e940272b79328ce37c8f22097"}, + {file = "yarl-1.9.11-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:614fa50fd0db41b79f426939a413d216cdc7bab8d8c8a25844798d286a999c5a"}, + {file = "yarl-1.9.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ff64f575d71eacb5a4d6f0696bfe991993d979423ea2241f23ab19ff63f0f9d1"}, + {file = "yarl-1.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c23f6dc3d7126b4c64b80aa186ac2bb65ab104a8372c4454e462fb074197bc6"}, + {file = "yarl-1.9.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8f847cc092c2b85d22e527f91ea83a6cf51533e727e2461557a47a859f96734"}, + {file = "yarl-1.9.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:63a5dc2866791236779d99d7a422611d22bb3a3d50935bafa4e017ea13e51469"}, + {file = "yarl-1.9.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c335342d482e66254ae94b1231b1532790afb754f89e2e0c646f7f19d09740aa"}, + {file = "yarl-1.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4a8c3dedd081cca134a21179aebe58b6e426e8d1e0202da9d1cafa56e01af3c"}, + {file = "yarl-1.9.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:504d19320c92532cabc3495fb7ed6bb599f3c2bfb45fed432049bf4693dbd6d0"}, + {file = "yarl-1.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b2a8e5eb18181060197e3d5db7e78f818432725c0759bc1e5a9d603d9246389"}, + {file = "yarl-1.9.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f568d70b7187f4002b6b500c0996c37674a25ce44b20716faebe5fdb8bd356e7"}, + {file = "yarl-1.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:735b285ea46ca7e86ad261a462a071d0968aade44e1a3ea2b7d4f3d63b5aab12"}, + {file = "yarl-1.9.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2d1c81c3b92bef0c1c180048e43a5a85754a61b4f69d6f84df8e4bd615bef25d"}, + {file = "yarl-1.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8d6e1c1562b53bd26efd38e886fc13863b8d904d559426777990171020c478a9"}, + {file = "yarl-1.9.11-cp312-cp312-win32.whl", hash = "sha256:aeba4aaa59cb709edb824fa88a27cbbff4e0095aaf77212b652989276c493c00"}, + {file = "yarl-1.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:569309a3efb8369ff5d32edb2a0520ebaf810c3059f11d34477418c90aa878fd"}, + {file = "yarl-1.9.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:4915818ac850c3b0413e953af34398775b7a337babe1e4d15f68c8f5c4872553"}, + {file = "yarl-1.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ef9610b2f5a73707d4d8bac040f0115ca848e510e3b1f45ca53e97f609b54130"}, + {file = "yarl-1.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:47c0a3dc8076a8dd159de10628dea04215bc7ddaa46c5775bf96066a0a18f82b"}, + {file = "yarl-1.9.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:545f2fbfa0c723b446e9298b5beba0999ff82ce2c126110759e8dac29b5deaf4"}, + {file = "yarl-1.9.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9137975a4ccc163ad5d7a75aad966e6e4e95dedee08d7995eab896a639a0bce2"}, + {file = "yarl-1.9.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0b0c70c451d2a86f8408abced5b7498423e2487543acf6fcf618b03f6e669b0a"}, + {file = "yarl-1.9.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce2bd986b1e44528677c237b74d59f215c8bfcdf2d69442aa10f62fd6ab2951c"}, + {file = "yarl-1.9.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8d7b717f77846a9631046899c6cc730ea469c0e2fb252ccff1cc119950dbc296"}, + {file = "yarl-1.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3a26a24bbd19241283d601173cea1e5b93dec361a223394e18a1e8e5b0ef20bd"}, + {file = "yarl-1.9.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c189bf01af155ac9882e128d9f3b3ad68a1f2c2f51404afad7201305df4e12b1"}, + {file = "yarl-1.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:0cbcc2c54084b2bda4109415631db017cf2960f74f9e8fd1698e1400e4f8aae2"}, + {file = "yarl-1.9.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:30f201bc65941a4aa59c1236783efe89049ec5549dafc8cd2b63cc179d3767b0"}, + {file = "yarl-1.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:922ba3b74f0958a0b5b9c14ff1ef12714a381760c08018f2b9827632783a590c"}, + {file = "yarl-1.9.11-cp313-cp313-win32.whl", hash = "sha256:17107b4b8c43e66befdcbe543fff2f9c93f7a3a9f8e3a9c9ac42bffeba0e8828"}, + {file = "yarl-1.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:0324506afab4f2e176a93cb08b8abcb8b009e1f324e6cbced999a8f5dd9ddb76"}, + {file = "yarl-1.9.11-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4e4f820fde9437bb47297194f43d29086433e6467fa28fe9876366ad357bd7bb"}, + {file = "yarl-1.9.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:dfa9b9d5c9c0dbe69670f5695264452f5e40947590ec3a38cfddc9640ae8ff89"}, + {file = "yarl-1.9.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e700eb26635ce665c018c8cfea058baff9b843ed0cc77aa61849d807bb82a64c"}, + {file = "yarl-1.9.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c305c1bdf10869b5e51facf50bd5b15892884aeae81962ae4ba061fc11217103"}, + {file = "yarl-1.9.11-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5b7b307140231ea4f7aad5b69355aba2a67f2d7bc34271cffa3c9c324d35b27"}, + {file = "yarl-1.9.11-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a744bdeda6c86cf3025c94eb0e01ccabe949cf385cd75b6576a3ac9669404b68"}, + {file = "yarl-1.9.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e8ed183c7a8f75e40068333fc185566472a8f6c77a750cf7541e11810576ea5"}, + {file = "yarl-1.9.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1db9a4384694b5d20bdd9cb53f033b0831ac816416ab176c8d0997835015d22"}, + {file = "yarl-1.9.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:70194da6e99713250aa3f335a7fa246b36adf53672a2bcd0ddaa375d04e53dc0"}, + {file = "yarl-1.9.11-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ddad5cfcda729e22422bb1c85520bdf2770ce6d975600573ac9017fe882f4b7e"}, + {file = "yarl-1.9.11-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:ca35996e0a4bed28fa0640d9512d37952f6b50dea583bcc167d4f0b1e112ac7f"}, + {file = "yarl-1.9.11-cp38-cp38-musllinux_1_2_s390x.whl", hash = 
"sha256:61ec0e80970b21a8f3c4b97fa6c6d181c6c6a135dbc7b4a601a78add3feeb209"}, + {file = "yarl-1.9.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:9636e4519f6c7558fdccf8f91e6e3b98df2340dc505c4cc3286986d33f2096c2"}, + {file = "yarl-1.9.11-cp38-cp38-win32.whl", hash = "sha256:58081cea14b8feda57c7ce447520e9d0a96c4d010cce54373d789c13242d7083"}, + {file = "yarl-1.9.11-cp38-cp38-win_amd64.whl", hash = "sha256:7d2dee7d6485807c0f64dd5eab9262b7c0b34f760e502243dd83ec09d647d5e1"}, + {file = "yarl-1.9.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d65ad67f981e93ea11f87815f67d086c4f33da4800cf2106d650dd8a0b79dda4"}, + {file = "yarl-1.9.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:752c0d33b4aacdb147871d0754b88f53922c6dc2aff033096516b3d5f0c02a0f"}, + {file = "yarl-1.9.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:54cc24be98d7f4ff355ca2e725a577e19909788c0db6beead67a0dda70bd3f82"}, + {file = "yarl-1.9.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c82126817492bb2ebc946e74af1ffa10aacaca81bee360858477f96124be39a"}, + {file = "yarl-1.9.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8503989860d7ac10c85cb5b607fec003a45049cf7a5b4b72451e87893c6bb990"}, + {file = "yarl-1.9.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:475e09a67f8b09720192a170ad9021b7abf7827ffd4f3a83826317a705be06b7"}, + {file = "yarl-1.9.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afcac5bda602b74ff701e1f683feccd8cce0d5a21dbc68db81bf9bd8fd93ba56"}, + {file = "yarl-1.9.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaeffcb84faceb2923a94a8a9aaa972745d3c728ab54dd011530cc30a3d5d0c1"}, + {file = "yarl-1.9.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:51a6f770ac86477cd5c553f88a77a06fe1f6f3b643b053fcc7902ab55d6cbe14"}, + {file = "yarl-1.9.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3fcd056cb7dff3aea5b1ee1b425b0fbaa2fbf6a1c6003e88caf524f01de5f395"}, + {file = "yarl-1.9.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:21e56c30e39a1833e4e3fd0112dde98c2abcbc4c39b077e6105c76bb63d2aa04"}, + {file = "yarl-1.9.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:0a205ec6349879f5e75dddfb63e069a24f726df5330b92ce76c4752a436aac01"}, + {file = "yarl-1.9.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a5706821e1cf3c70dfea223e4e0958ea354f4e2af9420a1bd45c6b547297fb97"}, + {file = "yarl-1.9.11-cp39-cp39-win32.whl", hash = "sha256:cc295969f8c2172b5d013c0871dccfec7a0e1186cf961e7ea575d47b4d5cbd32"}, + {file = "yarl-1.9.11-cp39-cp39-win_amd64.whl", hash = "sha256:55a67dd29367ce7c08a0541bb602ec0a2c10d46c86b94830a1a665f7fd093dfa"}, + {file = "yarl-1.9.11-py3-none-any.whl", hash = "sha256:c6f6c87665a9e18a635f0545ea541d9640617832af2317d4f5ad389686b4ed3d"}, + {file = "yarl-1.9.11.tar.gz", hash = "sha256:c7548a90cb72b67652e2cd6ae80e2683ee08fde663104528ac7df12d8ef271d2"}, ] [package.dependencies] diff --git a/usp/objects/page.py b/usp/objects/page.py index 191466d..3829565 100644 --- a/usp/objects/page.py +++ b/usp/objects/page.py @@ -62,6 +62,7 @@ def __init__( self.__stock_tickers = stock_tickers if stock_tickers else [] def __eq__(self, other) -> bool: + """Check equality.""" if not isinstance(other, SitemapNewsStory): raise NotImplementedError @@ -138,45 +139,30 @@ def __repr__(self) -> str: @property def title(self) -> str: - """ - Return story title. - - :return: Story title. 
- """ + """Get the story title.""" return self.__title @property def publish_date(self) -> datetime.datetime: - """ - Return story publication date. - - :return: Story publication date. - """ + """Get the story publication date.""" return self.__publish_date @property def publication_name(self) -> Optional[str]: - """ - Return name of the news publication in which the article appears in. - - :return: Name of the news publication in which the article appears in. - """ + """Get the name of the news publication in which the article appears.""" return self.__publication_name @property def publication_language(self) -> Optional[str]: - """Return primary language of the news publication in which the article appears in. + """Get the primary language of the news publication in which the article appears. It should be an ISO 639 Language Code (either 2 or 3 letters). - - :return: Primary language of the news publication in which the article appears in. """ return self.__publication_language @property def access(self) -> Optional[str]: - """ - Return accessibility of the article. + """Get the accessibility of the article. :return: Accessibility of the article. """ @@ -184,33 +170,25 @@ def access(self) -> Optional[str]: @property def genres(self) -> List[str]: - """ - Return list of properties characterizing the content of the article. - - Returns genres such as "PressRelease" or "UserGenerated". + """Get list of genres characterizing the content of the article. - :return: List of properties characterizing the content of the article + Genres will be one "PressRelease", "Satire", "Blog", "OpEd", "Opinion", "UserGenerated" """ return self.__genres @property def keywords(self) -> List[str]: - """ - Return list of keywords describing the topic of the article. - - :return: List of keywords describing the topic of the article. - """ + """Get list of keywords describing the topic of the article.""" return self.__keywords @property def stock_tickers(self) -> List[str]: - """ - Return list of up to 5 stock tickers that are the main subject of the article. + """Get stock tickers that are the main subject of the article. Each ticker must be prefixed by the name of its stock exchange, and must match its entry in Google Finance. For example, "NASDAQ:AMAT" (but not "NASD:AMAT"), or "BOM:500325" (but not "BOM:RIL"). - :return: List of up to 5 stock tickers that are the main subject of the article. + Up to 5 tickers can be provided. 
""" return self.__stock_tickers diff --git a/usp/tree.py b/usp/tree.py index 6b76bce..b44ad9e 100644 --- a/usp/tree.py +++ b/usp/tree.py @@ -1,7 +1,6 @@ """Helpers to generate a sitemap tree.""" from typing import Optional - from .exceptions import SitemapException from .fetch_parse import SitemapFetcher, SitemapStrParser from .helpers import is_http_url, strip_url_to_homepage From 062a62c1023cf94335665f0993581f8b8b90cca2 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Mon, 16 Dec 2024 11:27:19 +0000 Subject: [PATCH 61/79] add rtd config --- .readthedocs.yaml | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..72c661c --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,43 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + jobs: + post_create_environment: + # Install poetry + # https://python-poetry.org/docs/#installing-manually + - pip install poetry + post_install: + # Install dependencies with 'docs' dependency group + # https://python-poetry.org/docs/managing-dependencies/#dependency-groups + # VIRTUAL_ENV needs to be set manually for now. + # See https://github.com/readthedocs/readthedocs.org/pull/11152/ + - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry install --with docs + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +# python: +# install: +# - requirements: docs/requirements.txt \ No newline at end of file From 55aa959b5b25422fa529aeaad94ceeb57351ca61 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Mon, 16 Dec 2024 13:15:36 +0000 Subject: [PATCH 62/79] Fix multi-Python version test issues (#45) * Fix missing images key in save tests * disable fail fast * Change CI branch triggers * change usage of functools for py3.8 * Fix name of lint workflow * fix lint workflow step * lint * fix tuple subscription * change httpstatus usage * Fix graphviz dep in rtd config --- .github/workflows/lint.yml | 16 ++++++++++++---- .github/workflows/test.yml | 11 ++++++++++- .readthedocs.yaml | 2 ++ tests/tree/test_save.py | 6 ++++++ usp/fetch_parse.py | 1 + usp/objects/sitemap.py | 9 +++++---- usp/web_client/requests_client.py | 2 +- 7 files changed, 37 insertions(+), 10 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index d967cd5..bdcabf2 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -1,12 +1,20 @@ -name: Test +name: Lint -on: [push, pull_request] +on: + push: + branches: + - master + - develop + pull_request: + branches: + - master + - develop permissions: contents: read jobs: - test: + lint: runs-on: ubuntu-latest steps: @@ -22,7 +30,7 @@ jobs: run: poetry install --no-interaction --no-root - name: Install Project run: poetry install --no-interaction - - name: Poetry Build + - name: Ruff Lint Format 
run: poetry run ruff format --check id: format - name: Ruff Lint Check diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e03be65..a716e87 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,6 +1,14 @@ name: Test -on: [push, pull_request] +on: + push: + branches: + - master + - develop + pull_request: + branches: + - master + - develop permissions: contents: read @@ -10,6 +18,7 @@ jobs: runs-on: ubuntu-latest strategy: + fail-fast: false matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 72c661c..d29890d 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -13,6 +13,8 @@ build: # nodejs: "20" # rust: "1.70" # golang: "1.20" + apt_packages: + - graphviz jobs: post_create_environment: # Install poetry diff --git a/tests/tree/test_save.py b/tests/tree/test_save.py index 1d78ae0..9716358 100644 --- a/tests/tree/test_save.py +++ b/tests/tree/test_save.py @@ -57,6 +57,7 @@ def test_page_to_dict(self, tree, tmp_path): 2009, 12, 17, 12, 4, 56, tzinfo=tzoffset(None, 7200) ), "change_frequency": "monthly", + "images": None, "news_story": None, }, { @@ -66,6 +67,7 @@ def test_page_to_dict(self, tree, tmp_path): 2009, 12, 17, 12, 4, 56, tzinfo=tzoffset(None, 7200) ), "change_frequency": "always", + "images": None, "news_story": None, }, { @@ -73,6 +75,7 @@ def test_page_to_dict(self, tree, tmp_path): "priority": Decimal("0.5"), "last_modified": None, "change_frequency": None, + "images": None, "news_story": { "title": "Foo ", "publish_date": datetime.datetime( @@ -91,6 +94,7 @@ def test_page_to_dict(self, tree, tmp_path): "priority": Decimal("0.5"), "last_modified": None, "change_frequency": None, + "images": None, "news_story": { "title": "Bar & bar", "publish_date": datetime.datetime( @@ -109,6 +113,7 @@ def test_page_to_dict(self, tree, tmp_path): "priority": Decimal("0.5"), "last_modified": None, "change_frequency": None, + "images": None, "news_story": { "title": "Bar & bar", "publish_date": datetime.datetime( @@ -127,6 +132,7 @@ def test_page_to_dict(self, tree, tmp_path): "priority": Decimal("0.5"), "last_modified": None, "change_frequency": None, + "images": None, "news_story": { "title": "Bąž", "publish_date": datetime.datetime( diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py index ae995e4..e960d0e 100644 --- a/usp/fetch_parse.py +++ b/usp/fetch_parse.py @@ -608,6 +608,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser): class Image: """Data class for holding image data while parsing.""" + __slots__ = ["loc", "caption", "geo_location", "title", "license"] def __init__(self): diff --git a/usp/objects/sitemap.py b/usp/objects/sitemap.py index 9cdad94..38933b9 100644 --- a/usp/objects/sitemap.py +++ b/usp/objects/sitemap.py @@ -9,16 +9,17 @@ """ import abc -from functools import cache +from functools import lru_cache import os import pickle import tempfile -from typing import List, Iterator +from typing import List, Iterator, Tuple from .page import SitemapPage -@cache +# TODO: change to functools.cache when dropping py3.8 +@lru_cache(maxsize=None) def _all_slots(target_cls): mro = target_cls.__mro__ @@ -248,7 +249,7 @@ def __eq__(self, other) -> bool: def __repr__(self): return f"{self.__class__.__name__}(url={self.url}, pages={self.pages})" - def __getstate__(self) -> tuple[None, dict]: + def __getstate__(self) -> Tuple[None, dict]: # Load slots of this class and its parents (mangling if appropriate) obj_slots = {slot: getattr(self, slot) for slot in 
_all_slots(self.__class__)} # Replace temp file path with actual content diff --git a/usp/web_client/requests_client.py b/usp/web_client/requests_client.py index c27f58d..0719afb 100644 --- a/usp/web_client/requests_client.py +++ b/usp/web_client/requests_client.py @@ -43,7 +43,7 @@ def status_code(self) -> int: def status_message(self) -> str: message = self.__requests_response.reason if not message: - message = HTTPStatus(self.status_code(), None).phrase + message = HTTPStatus(self.status_code()).phrase return message def header(self, case_insensitive_name: str) -> Optional[str]: From 0b9258acb2bbd9e39ea85c51bfa9b0198a14c930 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Mon, 16 Dec 2024 13:23:59 +0000 Subject: [PATCH 63/79] update README --- README.rst | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/README.rst b/README.rst index cd85807..c8a00b7 100644 --- a/README.rst +++ b/README.rst @@ -6,10 +6,6 @@ :target: https://ultimate-sitemap-parser.readthedocs.io/en/latest/?badge=latest :alt: Documentation Status -.. image:: https://coveralls.io/repos/github/mediacloud/ultimate-sitemap-parser/badge.svg?branch=develop - :target: https://coveralls.io/github/mediacloud/ultimate-sitemap-parser?branch=develop - :alt: Coverage Status - .. image:: https://badge.fury.io/py/ultimate-sitemap-parser.svg :target: https://badge.fury.io/py/ultimate-sitemap-parser :alt: PyPI package @@ -23,8 +19,7 @@ :alt: Download stats -Website sitemap parser for Python 3.5+. - +Ultimate Sitemap Parser (USP) is a performant and robust Python library for parsing and crawling sitemaps. Features ======== @@ -69,18 +64,13 @@ Usage from usp.tree import sitemap_tree_for_homepage - tree = sitemap_tree_for_homepage('https://www.nytimes.com/') - print(tree) - -``sitemap_tree_for_homepage()`` will return a tree of ``AbstractSitemap`` subclass objects that represent the sitemap -hierarchy found on the website; see a `reference of AbstractSitemap subclasses `_. + tree = sitemap_tree_for_homepage('https://www.example.org/') -If you'd like to just list all the pages found in all of the sitemaps within the website, consider using ``all_pages()`` method: + for page in tree.all_pages(): + print(page.url) -.. code:: python +``sitemap_tree_for_homepage()`` will return a tree of ``AbstractSitemap`` subclass objects that represent the sitemap +hierarchy found on the website; see a `reference of AbstractSitemap subclasses `_. `AbstractSitemap.all_pages()` returns a generator to efficiently iterate over pages without loading the entire tree into memory. - # all_pages() returns an Iterator - for page in tree.all_pages(): - print(page) +For more examples and details, see the `documentation `_. -``all_pages()`` method will return an iterator yielding ``SitemapPage`` objects; see a `reference of SitemapPage `_. From 39cc4c5b7d4d6d2749832a1d99501e88e814b9f7 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Mon, 16 Dec 2024 13:28:46 +0000 Subject: [PATCH 64/79] update README badges [skip ci] --- README.rst | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/README.rst b/README.rst index c8a00b7..392bc76 100644 --- a/README.rst +++ b/README.rst @@ -1,22 +1,14 @@ -.. image:: https://travis-ci.org/mediacloud/ultimate-sitemap-parser.svg?branch=develop - :target: https://travis-ci.org/mediacloud/ultimate-sitemap-parser - :alt: Build Status +.. image:: https://img.shields.io/pypi/pyversions/ultimate-sitemap-parser + :alt: PyPI - Python Version -.. 
image:: https://readthedocs.org/projects/ultimate-sitemap-parser/badge/?version=latest - :target: https://ultimate-sitemap-parser.readthedocs.io/en/latest/?badge=latest - :alt: Documentation Status +.. image:: https://img.shields.io/pypi/v/ultimate-sitemap-parser + :alt: PyPI - Version -.. image:: https://badge.fury.io/py/ultimate-sitemap-parser.svg - :target: https://badge.fury.io/py/ultimate-sitemap-parser - :alt: PyPI package +.. image:: https://img.shields.io/conda/vn/conda-forge/ultimate-sitemap-parser + :alt: Conda Version -.. image:: https://img.shields.io/conda/v/conda-forge/ultimate-sitemap-parser?color=brightgreen - :target: https://anaconda.org/conda-forge/ultimate-sitemap-parser - :alt: Conda - -.. image:: https://pepy.tech/badge/ultimate-sitemap-parser - :target: https://pepy.tech/project/ultimate-sitemap-parser - :alt: Download stats +.. image:: https://img.shields.io/pepy/dt/ultimate-sitemap-parser + :alt: Pepy Total Downloads Ultimate Sitemap Parser (USP) is a performant and robust Python library for parsing and crawling sitemaps. From ce096b44075db7728718f798ddfb7b73942e1797 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Mon, 16 Dec 2024 13:34:17 +0000 Subject: [PATCH 65/79] Fix README badges [skip ci] --- README.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 392bc76..58eaa14 100644 --- a/README.rst +++ b/README.rst @@ -1,17 +1,24 @@ +# Ultimate Sitemap Parser + .. image:: https://img.shields.io/pypi/pyversions/ultimate-sitemap-parser :alt: PyPI - Python Version + :target: https://github.com/GateNLP/ultimate-sitemap-parser .. image:: https://img.shields.io/pypi/v/ultimate-sitemap-parser :alt: PyPI - Version + :target: https://pypi.org/project/ultimate-sitemap-parser/ .. image:: https://img.shields.io/conda/vn/conda-forge/ultimate-sitemap-parser :alt: Conda Version + :target: https://anaconda.org/conda-forge/ultimate-sitemap-parser .. image:: https://img.shields.io/pepy/dt/ultimate-sitemap-parser + :target: https://pepy.tech/project/ultimate-sitemap-parser :alt: Pepy Total Downloads -Ultimate Sitemap Parser (USP) is a performant and robust Python library for parsing and crawling sitemaps. +**Ultimate Sitemap Parser (USP) is a performant and robust Python library for parsing and crawling sitemaps.** + Features ======== @@ -65,4 +72,3 @@ Usage hierarchy found on the website; see a `reference of AbstractSitemap subclasses `_. `AbstractSitemap.all_pages()` returns a generator to efficiently iterate over pages without loading the entire tree into memory. For more examples and details, see the `documentation `_. - From c6047595539a9dba4a5b4e3be8edf03c07f5437a Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Mon, 16 Dec 2024 13:34:48 +0000 Subject: [PATCH 66/79] Fix README heading [skip ci] --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 58eaa14..2c34934 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,5 @@ -# Ultimate Sitemap Parser +Ultimate Sitemap Parser +----------------------- .. 
image:: https://img.shields.io/pypi/pyversions/ultimate-sitemap-parser :alt: PyPI - Python Version From d58d9e4d2bbb3cf7f24bf6ab64b2d2effade7991 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 17 Dec 2024 10:52:24 +0000 Subject: [PATCH 67/79] Fix integration tests (#46) --- .github/workflows/test_integration.yml | 8 +++----- tests/integration/download.py | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test_integration.yml b/.github/workflows/test_integration.yml index 8795011..902361f 100644 --- a/.github/workflows/test_integration.yml +++ b/.github/workflows/test_integration.yml @@ -1,4 +1,4 @@ -name: Test +name: Integration Test on: [workflow_dispatch] @@ -6,7 +6,7 @@ permissions: contents: read jobs: - test: + integ_test: runs-on: ubuntu-latest strategy: @@ -35,9 +35,7 @@ jobs: - name: Download cassettes run: poetry run python tests/integration/download.py -d - name: Run integration tests - run: poetry run pytest --integration --durations=0 \ - --junit-xml=$GITHUB_SHA.xml \ - tests/integration/test_integration.py + run: poetry run pytest --integration --durations=0 --junit-xml=integration.xml tests/integration/test_integration.py - name: Upload report uses: actions/upload-artifact@v4 with: diff --git a/tests/integration/download.py b/tests/integration/download.py index 3521179..66182bd 100644 --- a/tests/integration/download.py +++ b/tests/integration/download.py @@ -56,7 +56,7 @@ def dl_cassette(data): with requests.get(data["url"], allow_redirects=True, stream=True) as r: r.raise_for_status() - with open(dl_gz_path / dl_gz_path, "wb") as f: + with open(dl_gz_path, "wb") as f: for chunk in r.iter_content(chunk_size=8192): f.write(chunk) @@ -120,6 +120,7 @@ def cleanup_files(data, confirm=True): def main(force: bool = False, force_delete=False): logging.basicConfig(level=logging.INFO) + CASSETTE_ROOT.mkdir(exist_ok=True) (CASSETTE_ROOT / "download").mkdir(exist_ok=True) manifest = download_manifest() From dae9699401e8d44c2136b6377ad0895493df8706 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 17 Dec 2024 11:29:57 +0000 Subject: [PATCH 68/79] Add contributing guide --- docs/contributing.rst | 109 ++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + tests/integration/README.md | 51 ----------------- 3 files changed, 110 insertions(+), 51 deletions(-) create mode 100644 docs/contributing.rst delete mode 100644 tests/integration/README.md diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 0000000..22a6a0f --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1,109 @@ +Contributing +============ + +Get Started +----------- + +To install USP for development, you'll need `Poetry `_ to automatically manage the virtual environment. + +Fork and clone the repo, then run ``poetry install --with dev``. This will install all the dependencies and the package itself as an editable install. The remainder of this guide assumes you have activated the Poetry shell with ``poetry shell`` or will prefix each command with ``poetry run``. + +It's best practice to make an issue, or comment on an existing one, before working on your PR. + +Linting +------- + +We use `Ruff `_ to lint USP. This is done in two stages: + +.. code-block:: bash + + poetry run ruff check --fix + poetry run ruff format + +Testing +------- + +See the ``tests`` directory for automated tests written with Pytest. 
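A minimal sketch of such a regression test might look like the following, assuming ``requests_mock`` for HTTP stubbing (as the existing tests appear to use) and only the public ``sitemap_tree_for_homepage()`` API; the URLs, sitemap contents and test name are illustrative only:

.. code-block:: python

    import requests_mock

    from usp.tree import sitemap_tree_for_homepage


    def test_plain_text_sitemap_listed_in_robots_txt():
        with requests_mock.Mocker() as m:
            # Catch-all registered first: anything not mocked explicitly below
            # (e.g. probes of well-known sitemap paths) returns a plain 404.
            m.get(requests_mock.ANY, status_code=404, text="Not Found")
            m.get(
                "https://example.org/robots.txt",
                text="User-agent: *\nSitemap: https://example.org/sitemap.txt\n",
            )
            m.get(
                "https://example.org/sitemap.txt",
                text="https://example.org/page-1.html\n",
            )

            tree = sitemap_tree_for_homepage("https://example.org/")

            assert "https://example.org/page-1.html" in {
                page.url for page in tree.all_pages()
            }

The catch-all matcher is registered first because ``requests_mock`` consults matchers in reverse registration order, so the specific ``robots.txt`` and sitemap responses take precedence while every other request fails cleanly with a 404.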
+ +When contributing please make sure that: + +* any bugfixes include a test that fails without the fix +* any new functionality includes appropriate tests + +To run tests, use ``pytest`` or ``make test``. + +Integration Tests +----------------- + +Tests against real-world sitemaps are written using `VCR.py `_ to cache HTTP responses in *cassettes*. Due to the size of the cassettes, the cassette files are not included in this repository. + +.. caution:: + + Cassettes are generated by running a real scrape with the current version of USP. As such, changes to the HTTP client behaviour (such as modifying how requests are made or which paths are requested) may prevent integration tests from being run until cassettes are regenerated. Additionally, running integration tests against old versions is not necessarily supported. + +Downloading Cassettes +~~~~~~~~~~~~~~~~~~~~~ + +Cassettes are distributed from releases in a `separate repository `_. For an overview of available cassettes, see `the manifest file `_. + +To download and test against all cassettes, run: + +.. code-block:: bash + + python tests/integration/download.py + pytest --integration tests/integration + +Memory Profiling +~~~~~~~~~~~~~~~~ + +Ensure you have installed the extra ``perf`` dependency group with ``poetry install --with perf``. + +To profile memory during integration tests, run the test command with ``--memray``. + +.. code-block:: bash + + pytest --integration [--memray-bin-path memray] tests/integration --memray + +Without the ``--memray-bin-path`` argument, this will measure memory usage and report at the end of the test run. +With the argument, it will output the memory usage reports to the *memray* directory, which can then be used to generate reports e.g. `a flamegraph `_. + +Performance Profiling +~~~~~~~~~~~~~~~~~~~~~ + +Ensure you have installed the extra ``perf`` dependency group with ``poetry install --with perf``. + +To profile performance during tests, run through the pyinstrument CLI: + +.. code-block:: bash + + pyinstrument -m pytest --integration tests/integration + +Pyinstrument does not distinguish between tests, so you may want to filter to a specific test at a time with ``-k``. For example, to only run the bbc.co.uk test: + +.. code-block:: bash + + pyinstrument -m pytest --integration -k bbc tests/integration + +This can be viewed as an interactive HTML report by passing ``-r html`` to ``pyinstrument`` initially, or using the ``--load-prev`` command output at the end of the test run. + +Documentation +------------- + +This documentation is built with Sphinx. + +To build documentation, install the extra ``docs`` dependency group with ``poetry install --with docs``, then: + +.. code-block:: bash + + cd docs + make livehtml + +This will start a live build of the docs at ``http://localhost:8000``. + +Read the Docs will build a preview version when you make a PR. + +You may need to update the reference documentation with your changes: + +* The public interface is documented in ``docs/reference/api`` +* The CLI interface is documented in ``docs/reference/cli.rst`` +* Supported sitemap formats are documented in ``docs/reference/formats.rst`` with examples in ``docs/reference/formats_examples``. 
\ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 8b3dccf..44f599a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -32,6 +32,7 @@ Ultimate Sitemap Parser changelog acknowledgements + contributing GitHub PyPI Issues diff --git a/tests/integration/README.md b/tests/integration/README.md deleted file mode 100644 index 63375f8..0000000 --- a/tests/integration/README.md +++ /dev/null @@ -1,51 +0,0 @@ -# Integration & Performance Tests - -These tests use [VCR.py](https://vcrpy.readthedocs.io/) cassettes to avoid making real HTTP requests. Due to the size of the cassettes, they are not included in this repository. - -## Downloading Cassettes - -Cassettes are distributed from releases in a [separate repository](https://github.com/GateNLP/usp-test-cassettes). For an overview of available cassettes, see [the manifest file](https://github.com/GateNLP/usp-test-cassettes/blob/main/manifest.json). - -Run `python3 download.py` to download and decompress all available cassettes into the `cassettes` directory. - -Some cassette files are quite large when decompressed (~400MB) but compress relatively efficiently (~30MB). - -> [!IMPORTANT] -> In USP's tests, VCR.py is configured to run in `none` record mode (HTTP requests not included in the cassette will cause failure). -> This means that code changes causing new HTTP requests will temporarily break performance tests until the cassettes can be updated. - -## Running Tests - -Integration tests must be manually enabled with the `--integration` flag. - -```bash -pytest --integration tests/integration -``` - -## Memory Profiling with Memray - -To profile memory usage during tests, run the test command with the `--memray` - -```bash -pytest --memray [--memray-bin-path memray] --integration tests/integration -``` - -Without the --memray-bin-path argument, this will measure memory usage and report at the end of the test run. -With the argument, it will output the memory usage reports to the `memray` directory, which can then be used to generate reports e.g. [a flamegraph](https://bloomberg.github.io/memray/flamegraph.html). - - -## Performance Profiling with Pyinstrument - -To profile performance during tests, run through the pyinstrument CLI: - -```bash -pyinstrument -m pytest --integration tests/integration -``` - -Pyinstrument does not distinguish between tests, so you may want to filter to a specific test at a time with -k. For example, to only run the bbc.co.uk test: - -```bash -pyinstrument -m pytest --integration -k bbc tests/integration -``` - -This can be viewed as an interactive HTML report by passing `-r html` to `pyinstrument` initially, or using the `--load-prev` command output at the end of the test run. 
\ No newline at end of file From 5942dacee03e64b7e7fab6beb2196b2886e76721 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 17 Dec 2024 12:13:10 +0000 Subject: [PATCH 69/79] Combine v1.0 changelog --- docs/changelog.rst | 62 +++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index ee517d0..027f325 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -6,52 +6,46 @@ v1.0.0 (upcoming) **New Features** -- CLI tool to parse and list sitemaps on the command line (see :doc:`/reference/cli`) -- All sitemap objects now implement a consistent interface, allowing traversal of the tree irrespective of type: - - All sitemaps now have ``pages`` and ``sub_sitemaps`` properties, returning their children of that type, or an empty list where not applicable - - Added ``all_sitemaps()`` method to iterate over all descendant sitemaps -- Pickling page sitemaps now includes page data, which previously was not included as it was swapped to disk -- Sitemaps and pages now implement ``to_dict()`` method to convert to dictionaries -- Added optional arguments to ``usp.tree.sitemap_tree_for_homepage()`` to disable robots.txt-based or known-path-based sitemap discovery. Default behaviour is still to use both. -- Parse sitemaps from a string with :ref:`local parse` -- Support for the Google Image sitemap extension +* CLI tool to parse and list sitemaps on the command line (see :doc:`/reference/cli`) +* All sitemap objects now implement a consistent interface, allowing traversal of the tree irrespective of type: + + * All sitemaps now have ``pages`` and ``sub_sitemaps`` properties, returning their children of that type, or an empty list where not applicable + * Added ``all_sitemaps()`` method to iterate over all descendant sitemaps + +* Pickling page sitemaps now includes page data, which previously was not included as it was swapped to disk +* Sitemaps and pages now implement ``to_dict()`` method to convert to dictionaries +* Added optional arguments to ``usp.tree.sitemap_tree_for_homepage()`` to disable robots.txt-based or known-path-based sitemap discovery. Default behaviour is still to use both. +* Parse sitemaps from a string with :ref:`local parse` +* Support for the Google Image sitemap extension +* Add proxy support with ``RequestsWebClient.set_proxies()`` (:pr:`20` by :user:`tgrandje`) +* Add additional sitemap discovery paths for news sitemaps (:commit:`d3bdaae56be87c97ce2f3f845087f495f6439b44`) +* Add parameter to ``RequestsWebClient.__init__()`` to disable certificate verification (:pr:`37` by :user:`japherwocky`) **Performance** Improvement of parse performance by approximately 90%: -- Optimised lookup of page URLs when checking if duplicate -- Optimised datetime parse in XML Sitemaps by trying full ISO8601 parsers before the general parser +* Optimised lookup of page URLs when checking if duplicate +* Optimised datetime parse in XML Sitemaps by trying full ISO8601 parsers before the general parser **Bug Fixes** -- Invalid datetimes will be parsed as ``None`` instead of crashing (reported in :issue:`22`, :issue:`31`) -- Invalid priorities will be set to the default (0.5) instead of crashing -- Moved ``__version__`` attribute into main class module -- Robots.txt index sitemaps now count for the max recursion depth (reported in :issue:`29`). The default maximum has been increased by 1 to compensate for this. 
- -v0.6 (upcoming) ---------------- - -**New Features** - -- Add proxy support with ``RequestsWebClient.set_proxies()`` (:pr:`20` by :user:`tgrandje`) -- Add additional sitemap discovery paths for news sitemaps (:commit:`d3bdaae56be87c97ce2f3f845087f495f6439b44`) -- Add parameter to ``RequestsWebClient.__init__()`` to disable certificate verification (:pr:`37` by :user:`japherwocky`) - -**Bug Fixes** +* Invalid datetimes will be parsed as ``None`` instead of crashing (reported in :issue:`22`, :issue:`31`) +* Invalid priorities will be set to the default (0.5) instead of crashing +* Moved ``__version__`` attribute into main class module +* Robots.txt index sitemaps now count for the max recursion depth (reported in :issue:`29`). The default maximum has been increased by 1 to compensate for this. +* Remove log configuration so it can be specified at application level (:pr:`24` by :user:`dsoprea`) +* Resolve warnings caused by :external+python:class:`http.HTTPStatus` usage (:commit:`3867b6e`) +* Don't add ``InvalidSitemap`` object if ``robots.txt`` is not found (:pr:`39` by :user:`gbenson`) +* Fix incorrect lowercasing of URLS discovered in robots.txt (:pr:`35`) -- Remove log configuration so it can be specified at application level (:pr:`24` by :user:`dsoprea`) -- Resolve warnings caused by :external+python:class:`http.HTTPStatus` usage (:commit:`3867b6e`) -- Don't add ``InvalidSitemap`` object if ``robots.txt`` is not found (:pr:`39` by :user:`gbenson`) -- Fix incorrect lowercasing of URLS discovered in robots.txt (:pr:`35`) Prior versions -------------- For versions prior to 1.0, no changelog is available. Use the release tags to compare versions: -- `0.4...0.5 `__ -- `0.3...0.4 `__ -- `0.2...0.3 `__ -- `0.1...0.2 `__ +* `0.4...0.5 `__ +* `0.3...0.4 `__ +* `0.2...0.3 `__ +* `0.1...0.2 `__ From 182aa96ab7e70a6ba7e8984f78a23ee773372420 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Tue, 17 Dec 2024 12:36:33 +0000 Subject: [PATCH 70/79] Fix dark mode diagram issues --- docs/_static/css/custom.css | 12 ++++-------- docs/extensions/custom_graphviz.py | 30 +++++++++++++++--------------- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css index cef1a5b..b0edb15 100644 --- a/docs/_static/css/custom.css +++ b/docs/_static/css/custom.css @@ -2,20 +2,16 @@ svg.graphviz { max-width: 100% !important; height: auto !important; + + --graphviz-font-color: var(--color-content-foreground); } svg.graphviz .node a { text-decoration: none; - fill: var(--color-link); -} - -svg.graphviz .node a:hover { - fill: var(--pst-color-link-hover); - text-decoration: underline; + --graphviz-font-color: var(--color-link); } /* Make Rubric more like sphinx-book-theme */ - p.rubric { font-size: 1em; border-bottom: 1px solid var(--color-background-border); @@ -27,7 +23,7 @@ p.rubric { .code-card .sd-card-body { padding: 0; border-radius: 0 0 0.25rem 0.25rem; - background-color: #f0f0f0 !important; + /*background-color: var(--sd-color-card-background) !important;*/ } .code-card .sd-card-body>div { diff --git a/docs/extensions/custom_graphviz.py b/docs/extensions/custom_graphviz.py index 6f084fe..d1a62ab 100644 --- a/docs/extensions/custom_graphviz.py +++ b/docs/extensions/custom_graphviz.py @@ -334,8 +334,8 @@ def replace_var_in_code(m: re.Match) -> str: replacement_color = replace_var(var_text) return f'"{replacement_color}"' - # fontcolor = replace_var("var(--pst-color-link)") - # fontsize = "12" + fontcolor = replace_var("var(--graphviz-font-color)") 
+ fontsize = "12" graphviz_dot = options.get("graphviz_dot", self.builder.config.graphviz_dot) config_info = get_adjusted_graphviz_config(self.builder.app, graphviz_dot) @@ -344,18 +344,18 @@ def replace_var_in_code(m: re.Match) -> str: # ttf_font = font command_line_options = [ - # "-Ncolor=" + replace_var("var(--md-graphviz-node-fg-color)"), - # "-Nstyle=solid,filled", - # "-Nfillcolor=" + replace_var("var(--md-graphviz-node-bg-color)"), - # "-Nfontcolor=" + fontcolor, - # "-Nfontsize=" + fontsize, - # "-Ecolor=" + replace_var("var(--md-graphviz-edge-color)"), - # "-Efontcolor=" + fontcolor, - # "-Efontsize=" + fontsize, - # "-Gbgcolor=transparent", - # "-Gcolor=" + replace_var("var(--md-graphviz-node-fg-color)"), - # "-Gfontcolor=" + fontcolor, - # "-Gfontsize=" + fontsize, + "-Ncolor=" + replace_var("var(--color-content-foreground)"), + "-Nstyle=solid,filled", + "-Nfillcolor=" + replace_var("var(--color-content-background)"), + "-Nfontcolor=" + fontcolor, + "-Nfontsize=" + fontsize, + "-Ecolor=" + replace_var("var(--color-content-foreground)"), + "-Efontcolor=" + fontcolor, + "-Efontsize=" + fontsize, + "-Gbgcolor=transparent", + "-Gcolor=" + replace_var("var(--color-content-foreground)"), + "-Gfontcolor=" + fontcolor, + "-Gfontsize=" + fontsize, ] # if ttf_font is not None: # command_line_options.extend( @@ -449,7 +449,7 @@ def replace_var_in_code(m: re.Match) -> str: child.attrib["within_a"] = "true" within_a = attrib.pop("within_a", None) # if within_a: - # style += "--pst-color-link-hover: var(--pst-color-link-hover);" + # style += "--graphviz-hover-color: var(--pst-color-link-hover);" if style: attrib["style"] = style From f4f6b12e59c6cb468b1765cbe8a797dc306ae183 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 18 Dec 2024 10:40:26 +0000 Subject: [PATCH 71/79] Update NOTICE --- NOTICE | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NOTICE b/NOTICE index e4fb39c..2dde155 100644 --- a/NOTICE +++ b/NOTICE @@ -1,4 +1,5 @@ -Copyright (C) 2018 Linas Valiukas, Hal Roberts, Media Cloud project +Copyright (C) 2018 Linas Valiukas, Hal Roberts, Media Cloud project, + Freddy Heppell, The University of Sheffield, and other contributors. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by From b70cb3662a73652c6fed6acb30ba230cb7481844 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 18 Dec 2024 10:49:03 +0000 Subject: [PATCH 72/79] Update package config --- pyproject.toml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9a7a339..b8fc37f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,19 @@ [tool.poetry] name = "ultimate-sitemap-parser" -version = "0.6.0" +version = "1.0.0rc1" description = "Ult" authors = [ "Linas Valiukas ", - "Hal Roberts " + "Hal Roberts ", + "Freddy Heppell " +] +maintainers = [ + "Freddy Heppell " ] license = "GPL-3.0-or-later" readme = "README.rst" classifiers=[ - 'Development Status :: 3 - Alpha', + 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'Intended Audience :: Information Technology', 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', From c2eadfd6dbc91081d01cf84addb2726935ebbd29 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 18 Dec 2024 10:52:05 +0000 Subject: [PATCH 73/79] Update sitemap formats in README --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 2c34934..c5cf5f0 100644 --- a/README.rst +++ b/README.rst @@ -27,7 +27,7 @@ Features - Supports all sitemap formats: - `XML sitemaps `_ - - `Google News sitemaps `_ + - `Google News sitemaps `_ and `Image sitemaps `_ - `plain text sitemaps `_ - `RSS 2.0 / Atom 0.3 / Atom 1.0 sitemaps `_ - `Sitemaps linked from robots.txt `_ From 188ca49a86a02569a28e812322304296c39c93b6 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 18 Dec 2024 11:05:07 +0000 Subject: [PATCH 74/79] Add testpypi workflow --- .github/workflows/publish.yml | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 35eb448..4064500 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -47,6 +47,7 @@ jobs: path: dist/ - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 + github-release: name: GitHub release needs: @@ -86,4 +87,25 @@ jobs: run: >- gh release upload '${{ github.ref_name }}' dist/** - --repo '${{ github.repository }}' \ No newline at end of file + --repo '${{ github.repository }}' + + publish-to-testpypi: + name: Publish to TestPyPI + needs: + - build + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://test.pypi.org/p/ultimate-sitemap-parser + permissions: + id-token: write + steps: + - name: Download distribution packages + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ \ No newline at end of file From 5d5b6413b30e3222de9d60b8103a8f38ae09b67a Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 18 Dec 2024 11:07:37 +0000 Subject: [PATCH 75/79] Fix package description --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b8fc37f..5cb2a53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "ultimate-sitemap-parser" version = "1.0.0rc1" -description = "Ult" +description = "A performant library for parsing and 
crawling sitemaps" authors = [ "Linas Valiukas ", "Hal Roberts ", From b601092c84b3af758bdcfa6ecd9337876eab8678 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 18 Dec 2024 11:08:10 +0000 Subject: [PATCH 76/79] Bump to post1 for packaging test --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5cb2a53..ca4cef0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ultimate-sitemap-parser" -version = "1.0.0rc1" +version = "1.0.0rc1.post1" description = "A performant library for parsing and crawling sitemaps" authors = [ "Linas Valiukas ", From 0d14ade35d8d6e553dadca4cc2be427e69d4e91c Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 18 Dec 2024 11:11:48 +0000 Subject: [PATCH 77/79] Bump publish action versions --- .github/workflows/publish.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 4064500..bc67f81 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -25,7 +25,7 @@ jobs: - name: Build run: poetry build - name: Store distribution packages - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: python-package-distributions path: dist/ @@ -41,7 +41,7 @@ jobs: id-token: write steps: - name: Download distribution packages - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: python-package-distributions path: dist/ @@ -60,12 +60,12 @@ jobs: steps: - name: Download distribution packages - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: python-package-distributions path: dist/ - name: Sign the dists with Sigstore - uses: sigstore/gh-action-sigstore-python@v2.1.1 + uses: sigstore/gh-action-sigstore-python@v3.0.0 with: inputs: >- ./dist/*.tar.gz From 82f7bbcb2adbee43a90d51dcef05fce36db0db0f Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 18 Dec 2024 11:21:39 +0000 Subject: [PATCH 78/79] Update package metadata --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index ca4cef0..15277d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,9 @@ authors = [ maintainers = [ "Freddy Heppell " ] +homepage = "https://ultimate-sitemap-parser.readthedocs.io/" +documentation = "https://ultimate-sitemap-parser.readthedocs.io/" +repository = "https://github.com/GateNLP/ultimate-sitemap-parser" license = "GPL-3.0-or-later" readme = "README.rst" classifiers=[ @@ -23,6 +26,7 @@ classifiers=[ 'Topic :: Text Processing :: Indexing', 'Topic :: Text Processing :: Markup :: XML', ] +keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"] packages = [ { include = "usp" } ] From f283ea18da8cfe37a39b3085a823d92149813028 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Wed, 18 Dec 2024 11:26:21 +0000 Subject: [PATCH 79/79] Prepare for 1.0.0rc1 public release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 15277d8..78389e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ultimate-sitemap-parser" -version = "1.0.0rc1.post1" +version = "1.0.0rc1" description = "A performant library for parsing and crawling sitemaps" authors = [ "Linas Valiukas ",