Skip to content

Commit

Permalink
Add request wait (#57)
Browse files Browse the repository at this point in the history
* add request waiting

* Add wait and random wait
  • Loading branch information
freddyheppell authored Jan 20, 2025
1 parent ebef222 commit f64ca64
Show file tree
Hide file tree
Showing 6 changed files with 353 additions and 150 deletions.
1 change: 1 addition & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Upcoming

* Added support for :ref:`alternate localised pages <sitemap-extra-localisation>` with ``hreflang``.
* If an HTTP error is encountered, the contents of the error page is logged at ``INFO`` level.
* Added optional configurable wait time to HTTP request client.

v1.0.0 (2025-01-13)
-------------------
Expand Down
424 changes: 276 additions & 148 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ requests-mock = ">=1.6.0,<2.0"
pytest = "^8.3.0"
ruff = "^0.6.1"
vcrpy = "6.0.1"
pytest-mock = "^3.14.0"

[tool.poetry.group.perf]
optional = true
Expand Down
25 changes: 25 additions & 0 deletions tests/web_client/test_requests_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,28 @@ def test_error_page_log(self, client, requests_mock, caplog):
client.get(test_url)

assert "Response content: This page is broken." in caplog.text

@pytest.fixture
def mocked_sleep(self, mocker):
    # Patch time.sleep where abstract_client imported it, so wait-related
    # tests run instantly and can inspect the requested sleep durations.
    return mocker.patch("usp.web_client.abstract_client.time.sleep")

def test_no_request_wait(self, mocked_sleep):
    # With no wait configured (the default), back-to-back requests must
    # never call time.sleep.
    client = RequestsWebClient()
    client.get(self.TEST_BASE_URL + "/page1.html")
    client.get(self.TEST_BASE_URL + "/page2.html")
    mocked_sleep.assert_not_called()

def test_request_wait(self, mocked_sleep):
    # A fixed wait sleeps between consecutive requests, but never before
    # the very first one.
    client = RequestsWebClient(wait=1)
    client.get(self.TEST_BASE_URL + "/page1.html")
    mocked_sleep.assert_not_called()  # first request is not delayed
    client.get(self.TEST_BASE_URL + "/page2.html")
    # random_wait defaults to False, so the exact configured value is used.
    mocked_sleep.assert_called_once_with(1)

def test_request_wait_random(self, mocked_sleep, mocker):
    """Randomised wait multiplies the base wait by random.uniform(0.5, 1.5).

    random.uniform is pinned to a fixed value: the original assertion
    ``call_args[0][0] != 1`` could (rarely) fail when uniform() returned
    exactly 1.0, and a pure range check cannot distinguish a randomised
    wait from a fixed one.
    """
    mocked_uniform = mocker.patch(
        "usp.web_client.abstract_client.random.uniform", return_value=0.75
    )
    client = RequestsWebClient(wait=1, random_wait=True)
    client.get(self.TEST_BASE_URL + "/page1.html")
    client.get(self.TEST_BASE_URL + "/page2.html")
    # Only the gap between the two requests sleeps, and the factor must be
    # drawn from the documented [0.5, 1.5] range.
    mocked_uniform.assert_called_once_with(0.5, 1.5)
    mocked_sleep.assert_called_once_with(1 * 0.75)
35 changes: 35 additions & 0 deletions usp/web_client/abstract_client.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Abstract web client class."""

import abc
import random
from http import HTTPStatus
import time
from typing import Optional

RETRYABLE_HTTP_STATUS_CODES = {
Expand Down Expand Up @@ -187,3 +189,36 @@ def set_max_response_data_length(

def get(self, url: str) -> AbstractWebClientResponse:
    # Null-object fallback: any attempt to fetch a URL fails loudly rather
    # than silently returning nothing.
    raise NoWebClientException


class RequestWaiter:
    """
    Manages waiting between requests.

    The first call to :meth:`wait` never sleeps, so the initial request is
    not delayed; every subsequent call sleeps for the configured duration.

    NOTE(review): ``random_wait`` defaults to ``True`` here but to ``False``
    in ``RequestsWebClient.__init__`` — confirm which default is intended.
    """

    def __init__(self, wait: Optional[float] = None, random_wait: bool = True):
        """
        :param wait: time to wait between requests, in seconds. ``None`` or
            any value <= 0 disables waiting entirely.
        :param random_wait: if true, wait time is multiplied by a random
            number between 0.5 and 1.5, drawn independently per request.
        """
        # Clamp negative values to 0 so time.sleep() is never handed a
        # negative duration (which would raise ValueError on the second
        # request).
        self.wait_s = max(wait or 0, 0)
        self.random_wait = random_wait
        self.is_first = True  # tracks whether a request has been made yet

    def wait(self) -> None:
        """Perform a wait if needed. Should be called before each request.
        Will skip wait if waiting is disabled or this is the first request.
        """
        if self.wait_s == 0:
            # Waiting disabled; leave is_first untouched.
            return

        if self.is_first:
            # Never delay the very first request; only the gaps between
            # consecutive requests are throttled.
            self.is_first = False
            return

        wait_f = 1.0
        if self.random_wait:
            wait_f = random.uniform(0.5, 1.5)

        time.sleep(self.wait_s * wait_f)
17 changes: 15 additions & 2 deletions usp/web_client/requests_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
AbstractWebClient,
AbstractWebClientResponse,
AbstractWebClientSuccessResponse,
RequestWaiter,
WebClientErrorResponse,
RETRYABLE_HTTP_STATUS_CODES,
)
Expand Down Expand Up @@ -79,16 +80,27 @@ class RequestsWebClient(AbstractWebClient):
Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big.
"""

__slots__ = ["__max_response_data_length", "__timeout", "__proxies", "__verify"]
__slots__ = [
"__max_response_data_length",
"__timeout",
"__proxies",
"__verify",
"__waiter",
]

def __init__(self, verify=True):
def __init__(
    self, verify=True, wait: Optional[float] = None, random_wait: bool = False
):
    """
    :param verify: whether certificates should be verified for HTTPS requests.
    :param wait: time to wait between requests, in seconds.
    :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
    """
    # No response-size cap until set_max_response_data_length() is called.
    self.__max_response_data_length = None
    self.__timeout = self.__HTTP_REQUEST_TIMEOUT
    self.__proxies = {}
    self.__verify = verify
    # Throttles consecutive get() calls; the first request is never delayed.
    self.__waiter = RequestWaiter(wait, random_wait)

def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
"""Set HTTP request timeout.
Expand All @@ -115,6 +127,7 @@ def set_max_response_data_length(self, max_response_data_length: int) -> None:
self.__max_response_data_length = max_response_data_length

def get(self, url: str) -> AbstractWebClientResponse:
self.__waiter.wait()
try:
response = requests.get(
url,
Expand Down

0 comments on commit f64ca64

Please sign in to comment.