Commit

Ensure retries are logged
rohanpm committed Jun 14, 2019
1 parent 4f0fbcf commit 924600d
Showing 8 changed files with 119 additions and 14 deletions.
1 change: 1 addition & 0 deletions .pylintrc
@@ -95,4 +95,5 @@ disable=print-statement,
protected-access,
# we use black code style and don't need additional checks
bad-continuation,
line-too-long,

1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Task error_summary and error_details are now initialized appropriately
with data from Pulp
- Client now logs Pulp load every few minutes when awaiting tasks
- Client now logs warnings when Pulp operations are being retried

## [0.1.1] - 2019-06-13

23 changes: 23 additions & 0 deletions docs/logging.rst
@@ -30,3 +30,26 @@ These messages will include an ``event`` attribute of the form:
type: awaiting-pulp
running-tasks: <count>
waiting-tasks: <count>

Retrying
--------

Many methods in this library will implicitly retry failing operations a few
times. When an operation is retried, a ``WARNING`` message is logged before the
retry occurs, as in the following examples:

.. code-block::

    [WARNING] Retrying due to error: 401 Client Error: Unauthorized for url: https://pulp.example.com/pulp/api/v2/repositories/search/ [1/10]
    [WARNING] Retrying due to error: Task e239ae4f-7fad-4004-bfb6-8e06f17d22ef failed [3/10]

The ``[1/10]`` indicator shows the current attempt number and the maximum
number of attempts before the error is considered fatal.

These messages will include an ``event`` attribute of the form:

.. code-block:: yaml

    event:
      type: pulp-retry
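
As a hedged sketch (not part of this commit): the ``event`` attribute is passed
to the logger via ``extra`` in ``retry.py`` below, so it is available as
``record.event`` and a consumer could observe these structured events with a
standard ``logging.Filter``. The ``RetryObserver`` name is hypothetical.

.. code-block:: python

    import logging

    class RetryObserver(logging.Filter):
        """Counts pulp-retry events emitted on the pubtools.pulplib logger."""

        def __init__(self):
            super(RetryObserver, self).__init__()
            self.retry_count = 0

        def filter(self, record):
            event = getattr(record, "event", None)
            if isinstance(event, dict) and event.get("type") == "pulp-retry":
                self.retry_count += 1
            # Always let the record through; this filter only observes.
            return True

    observer = RetryObserver()
    logging.getLogger("pubtools.pulplib").addFilter(observer)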
2 changes: 1 addition & 1 deletion pubtools/pulplib/_impl/client/client.py
@@ -54,7 +54,7 @@ class Client(object):
_REQUEST_THREADS = int(os.environ.get("PUBTOOLS_PULPLIB_REQUEST_THREADS", "4"))
_PAGE_SIZE = int(os.environ.get("PUBTOOLS_PULPLIB_PAGE_SIZE", "2000"))
_TASK_THROTTLE = int(os.environ.get("PUBTOOLS_PULPLIB_TASK_THROTTLE", "500"))
_RETRY_POLICY = lambda *_: retry.new_policy()
_RETRY_POLICY = lambda *_: retry.PulpRetryPolicy()

def __init__(self, url, **kwargs):
"""
68 changes: 63 additions & 5 deletions pubtools/pulplib/_impl/client/retry.py
@@ -1,7 +1,65 @@
from more_executors.retry import ExceptionRetryPolicy
import os
import logging
import traceback

from more_executors.retry import RetryPolicy, ExceptionRetryPolicy

def new_policy():
# TODO: we'll need our own policy with improved logging and handling of
# certain "no point in retrying" errors
return ExceptionRetryPolicy()
from .errors import TaskFailedException

LOG = logging.getLogger("pubtools.pulplib")

ATTEMPTS = int(os.environ.get("PUBTOOLS_PULPLIB_RETRY_ATTEMPTS", "10"))
SLEEP = float(os.environ.get("PUBTOOLS_PULPLIB_RETRY_SLEEP", "1.0"))
MAX_SLEEP = float(os.environ.get("PUBTOOLS_PULPLIB_RETRY_MAX_SLEEP", "120.0"))


class PulpRetryPolicy(RetryPolicy):
def __init__(self, max_attempts=ATTEMPTS, sleep=SLEEP, max_sleep=MAX_SLEEP):
super(PulpRetryPolicy, self).__init__()
self._max_attempts = max_attempts
self._delegate = ExceptionRetryPolicy(
max_attempts=max_attempts, sleep=sleep, max_sleep=max_sleep
)

def should_retry(self, attempt, future):
# This is still using only the default ExceptionRetryPolicy behavior.
# I expect at some point we will need to fine-tune this and look
# at the error in more detail in order to decide whether to retry.
retry = self._delegate.should_retry(attempt, future)

if future.exception() and retry:
self._log_retry(attempt, future)

return retry

def sleep_time(self, attempt, future):
return self._delegate.sleep_time(attempt, future)

def _log_retry(self, attempt, future):
# TODO: it would be nice if we could add some info here about
# *what* is being retried. But it's a bit of a challenge since
# future objects themselves don't come with any info on what was
# done to create the future.
exception = future.exception()
LOG.warning(
"Retrying due to error: %s [%d/%d]%s",
exception,
attempt,
self._max_attempts,
self._traceback(exception),
extra={"event": {"type": "pulp-retry"}},
)

def _traceback(self, exception):
out = ""

if isinstance(exception, TaskFailedException):
            # No point in logging a traceback for this exception; the message
            # "Task <id> failed" is already all the useful info
pass
elif hasattr(exception, "__traceback__"):
out = "\n" + "".join(
traceback.format_exception(None, exception, exception.__traceback__)
)

return out
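
The TODO in ``should_retry`` above anticipates fine-tuning based on the kind of
error encountered. As a hedged sketch of that idea (not part of this commit; the
404 rule and the ``SelectivePulpRetryPolicy`` name are assumptions for
illustration only), a subclass could inspect the exception before delegating:

.. code-block:: python

    import requests

    class SelectivePulpRetryPolicy(PulpRetryPolicy):
        """Hypothetical policy: treat HTTP 404 responses as non-retryable."""

        def should_retry(self, attempt, future):
            exception = future.exception()
            if isinstance(exception, requests.exceptions.HTTPError):
                response = exception.response
                if response is not None and response.status_code == 404:
                    # A missing resource will not appear by retrying.
                    return False
            return super(SelectivePulpRetryPolicy, self).should_retry(attempt, future)

Such a policy could be wired in the same way the tests do, by overriding
``_RETRY_POLICY`` on a ``Client`` subclass (see ``tests/conftest.py`` below).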
21 changes: 18 additions & 3 deletions tests/client/test_client.py
@@ -2,8 +2,6 @@
import requests_mock
from pubtools.pulplib import Client, Criteria, Repository, PulpException

from more_executors.retry import ExceptionRetryPolicy


def test_can_construct(requests_mocker):
client = Client("https://pulp.example.com/")
@@ -40,7 +38,7 @@ def test_can_search(client, requests_mocker):
assert requests_mocker.call_count == 1


def test_search_retries(client, requests_mocker):
def test_search_retries(client, requests_mocker, caplog):
requests_mocker.post(
"https://pulp.example.com/pulp/api/v2/repositories/search/",
[
@@ -62,6 +60,23 @@ def test_search_retries(client, requests_mocker):
# But would have needed several attempts
assert requests_mocker.call_count == 4

# And those retries should have been logged
messages = caplog.messages
assert len(messages) == 3

# Messages have full exception detail. Just check the first line.
lines = [m.splitlines()[0] for m in messages]

assert lines[0].startswith("Retrying due to error: 413")
assert lines[0].endswith(" [1/6]")
assert lines[1].startswith("Retrying due to error: 500")
assert lines[1].endswith(" [2/6]")
assert lines[2].startswith("Retrying due to error:")
assert lines[2].endswith(" [3/6]")

# Retry logs should have the pulp-retry event attached.
assert caplog.records[-1].event == {"type": "pulp-retry"}


def test_search_can_paginate(client, requests_mocker):
client._PAGE_SIZE = 10
9 changes: 5 additions & 4 deletions tests/conftest.py
@@ -1,9 +1,9 @@
from more_executors.retry import ExceptionRetryPolicy
import pytest
import requests_mock

from pubtools.pulplib import Client
from pubtools.pulplib._impl.client.poller import TaskPoller
from pubtools.pulplib._impl.client.retry import PulpRetryPolicy


@pytest.fixture
@@ -36,9 +36,10 @@ def fast_poller():
TaskPoller.DELAY = old_delay


def fast_retry_policy(_client):
return ExceptionRetryPolicy(max_attempts=6, max_sleep=0.001)
class FastRetryPolicy(PulpRetryPolicy):
def __init__(self):
super(FastRetryPolicy, self).__init__(max_attempts=6, max_sleep=0.001)


class FastRetryClient(Client):
_RETRY_POLICY = fast_retry_policy
_RETRY_POLICY = lambda *_: FastRetryPolicy()
8 changes: 7 additions & 1 deletion tests/repository/test_publish.py
@@ -168,7 +168,7 @@ def test_publish_fail(fast_poller, requests_mocker, client):
assert "Task task1 failed" in str(error)


def test_publish_retries(fast_poller, requests_mocker, client):
def test_publish_retries(fast_poller, requests_mocker, client, caplog):
"""publish retries distributors as they fail"""
repo = Repository(
id="some-repo",
@@ -220,3 +220,9 @@ def test_publish_retries(fast_poller, requests_mocker, client):
"cdn_distributor",
"cdn_distributor",
]

# The retry should have been logged
messages = caplog.messages
assert (
messages[0].splitlines()[0] == "Retrying due to error: Task task2 failed [1/6]"
)
