Skip to content

Commit cbb9c14

Browse files
lion-sz and Mr0grog authored
Ratelimits (#79)
Add rate limiting to `search` (not just `get_memento`) and make the limits configurable in the `WaybackSession` constructor. Fixes #12. Co-authored-by: Rob Brackett <[email protected]>
1 parent 81bff09 commit cbb9c14

File tree

6 files changed

+6626
-31
lines changed

6 files changed

+6626
-31
lines changed

docs/source/release-history.rst

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,34 @@ While we were at it, we renamed the ``datetime`` parameter of :meth:`wayback.Way
3030
Features
3131
^^^^^^^^
3232

33-
:attr:`wayback.Memento.headers` is now case-insensitive. The keys of the ``headers`` dict are returned with their original case when iterating, but lookups are performed case-insensitively. For example::
33+
- :attr:`wayback.Memento.headers` is now case-insensitive. The keys of the ``headers`` dict are returned with their original case when iterating, but lookups are performed case-insensitively. For example::
3434

35-
list(memento.headers) == ['Content-Type', 'Date']
36-
memento.headers['Content-Type'] == memento.headers['content-type']
35+
list(memento.headers) == ['Content-Type', 'Date']
36+
memento.headers['Content-Type'] == memento.headers['content-type']
3737

38-
(:issue:`98`)
38+
(:issue:`98`)
39+
40+
- There are now built-in rate limits for calls to ``search()`` and ``get_memento()``. The default values should keep you from getting temporarily blocked by the Wayback Machine servers, but you can also adjust them when instantiating :class:`wayback.WaybackSession`:
41+
42+
.. code-block:: python
43+
44+
# Limit get_memento() calls to 2 per second (or one every 0.5 seconds):
45+
client = WaybackClient(WaybackSession(memento_calls_per_second=2))
46+
47+
# These now take a minimum of 0.5 seconds, even if the Wayback Machine
48+
# responds instantly (there's no delay on the first call):
49+
client.get_memento('http://www.noaa.gov/', timestamp='20180816111911')
50+
client.get_memento('http://www.noaa.gov/', timestamp='20180829092926')
51+
52+
A huge thanks to @LionSzl for implementing this. (:issue:`12`)
3953

4054

4155
Fixes & Maintenance
4256
^^^^^^^^^^^^^^^^^^^
4357

44-
All API requests to archive.org now use HTTPS instead of HTTP. Thanks to @sundhaug92 for calling this out. (:issue:`81`)
58+
- All API requests to archive.org now use HTTPS instead of HTTP. Thanks to @sundhaug92 for calling this out. (:issue:`81`)
4559

46-
Headers from the original archived response are again included in :attr:`wayback.Memento.headers`. As part of this, the ``headers`` attribute is now case-insensitive (see new features above), since the Internet Archive servers now return headers with different cases depending on how the request was made. (:issue:`98`)
60+
- Headers from the original archived response are again included in :attr:`wayback.Memento.headers`. As part of this, the ``headers`` attribute is now case-insensitive (see new features above), since the Internet Archive servers now return headers with different cases depending on how the request was made. (:issue:`98`)
4761

4862

4963
v0.3.3 (2022-09-30)

wayback/_client.py

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -187,12 +187,9 @@ def from_httplib(cls, httplib_response, **response_kwargs):
187187
#####################################################################
188188

189189

190-
# TODO: make rate limiting configurable at the session level, rather than
191-
# arbitrarily set inside get_memento(). Idea: have a rate limit lock type and
192-
# pass an instance to the constructor here.
193190
class WaybackSession(_utils.DisableAfterCloseSession, requests.Session):
194191
"""
195-
A custom session object that network pools connections and resources for
192+
A custom session object that pools network connections and resources for
196193
requests to the Wayback Machine.
197194
198195
Parameters
@@ -215,6 +212,12 @@ class WaybackSession(_utils.DisableAfterCloseSession, requests.Session):
215212
user_agent : str, optional
216213
A custom user-agent string to use in all requests. Defaults to:
217214
`wayback/{version} (+https://github.com/edgi-govdata-archiving/wayback)`
215+
search_calls_per_second : int or float, default: 1.5
216+
The maximum number of calls made to the search API per second.
217+
To disable the rate limit, set this to 0.
218+
memento_calls_per_second : int or float, default: 30
219+
The maximum number of calls made to the memento API per second.
220+
To disable the rate limit, set this to 0.
218221
"""
219222

220223
# It seems Wayback sometimes produces 500 errors for transient issues, so
@@ -227,7 +230,8 @@ class WaybackSession(_utils.DisableAfterCloseSession, requests.Session):
227230
# just the error type. See `should_retry_error()`.
228231
handleable_errors = (ConnectionError,) + retryable_errors
229232

230-
def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None):
233+
def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None,
234+
search_calls_per_second=1.5, memento_calls_per_second=30):
231235
super().__init__()
232236
self.retries = retries
233237
self.backoff = backoff
@@ -237,6 +241,8 @@ def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None):
237241
f'wayback/{__version__} (+https://github.com/edgi-govdata-archiving/wayback)'),
238242
'Accept-Encoding': 'gzip, deflate'
239243
}
244+
self.search_calls_per_second = search_calls_per_second
245+
self.memento_calls_per_second = memento_calls_per_second
240246
# NOTE: the nice way to accomplish retry/backoff is with a urllib3:
241247
# adapter = requests.adapters.HTTPAdapter(
242248
# max_retries=Retry(total=5, backoff_factor=2,
@@ -530,24 +536,26 @@ def search(self, url, *, match_type=None, limit=1000, offset=None,
530536
previous_result = None
531537
while next_query:
532538
sent_query, next_query = next_query, None
533-
response = self.session.request('GET', CDX_SEARCH_URL,
534-
params=sent_query)
535-
try:
536-
# Read/cache the response and close straightaway. If we need to
537-
# raise for status, we want to pre-emptively close the response
538-
# so a user handling the error doesn't need to worry about it. If
539-
# we don't raise here, we still want to close the connection so it
540-
# doesn't leak when we move onto the next of results or when this
541-
# iterator ends.
542-
read_and_close(response)
543-
response.raise_for_status()
544-
except requests.exceptions.HTTPError as error:
545-
if 'AdministrativeAccessControlException' in response.text:
546-
raise BlockedSiteError(query['url'])
547-
elif 'RobotAccessControlException' in response.text:
548-
raise BlockedByRobotsError(query['url'])
549-
else:
550-
raise WaybackException(str(error))
539+
with _utils.rate_limited(self.session.search_calls_per_second,
540+
group='search'):
541+
response = self.session.request('GET', CDX_SEARCH_URL,
542+
params=sent_query)
543+
try:
544+
# Read/cache the response and close straightaway. If we need
545+
# to raise for status, we want to pre-emptively close the
546+
# response so a user handling the error doesn't need to
547+
# worry about it. If we don't raise here, we still want to
548+
# close the connection so it doesn't leak when we move onto
549+
# the next page of results or when this iterator ends.
550+
read_and_close(response)
551+
response.raise_for_status()
552+
except requests.exceptions.HTTPError as error:
553+
if 'AdministrativeAccessControlException' in response.text:
554+
raise BlockedSiteError(query['url'])
555+
elif 'RobotAccessControlException' in response.text:
556+
raise BlockedByRobotsError(query['url'])
557+
else:
558+
raise WaybackException(str(error))
551559

552560
lines = iter(response.content.splitlines())
553561

@@ -718,7 +726,8 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
718726
mode=mode,
719727
url=original_url)
720728

721-
with _utils.rate_limited(calls_per_second=30, group='get_memento'):
729+
with _utils.rate_limited(calls_per_second=self.session.memento_calls_per_second,
730+
group='get_memento'):
722731
# Correctly following redirects is actually pretty complicated. In
723732
# the simplest case, a memento is a simple web page, and that's
724733
# no problem. However...

0 commit comments

Comments
 (0)