Commit

Ensure retries are logged
rohanpm committed Jun 14, 2019
1 parent 4f0fbcf commit 924600d
Showing 8 changed files with 119 additions and 14 deletions.
1 change: 1 addition & 0 deletions .pylintrc
@@ -95,4 +95,5 @@ disable=print-statement,
protected-access,
# we use black code style and don't need additional checks
bad-continuation,
line-too-long,

1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Task error_summary and error_details are now initialized appropriately
with data from Pulp
- Client now logs Pulp load every few minutes when awaiting tasks
- Client now logs warnings when Pulp operations are being retried

## [0.1.1] - 2019-06-13

23 changes: 23 additions & 0 deletions docs/logging.rst
@@ -30,3 +30,26 @@ These messages will include an ``event`` attribute of the form:
type: awaiting-pulp
running-tasks: <count>
waiting-tasks: <count>

Retrying
--------

Many methods in this library will implicitly retry failing operations a few
times. When an operation is retried, a ``WARNING`` message is logged before the
retry occurs, as in the following examples:

.. code-block::

    [WARNING] Retrying due to error: 401 Client Error: Unauthorized for url: https://pulp.example.com/pulp/api/v2/repositories/search/ [1/10]
    [WARNING] Retrying due to error: Task e239ae4f-7fad-4004-bfb6-8e06f17d22ef failed [3/10]

The ``[1/10]`` indicator shows the current attempt number and the maximum
number of attempts before the error is considered fatal.

These messages will include an ``event`` attribute of the form:

.. code-block:: yaml

    event:
      type: pulp-retry
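
As a hedged sketch (not part of this commit): the ``event`` attribute is passed
to the logger via ``extra`` in ``retry.py`` below, so it is available as
``record.event`` and a consumer could observe these structured events with a
standard ``logging.Filter``. The ``RetryObserver`` name is hypothetical.

.. code-block:: python

    import logging

    class RetryObserver(logging.Filter):
        """Counts pulp-retry events emitted on the pubtools.pulplib logger."""

        def __init__(self):
            super(RetryObserver, self).__init__()
            self.retry_count = 0

        def filter(self, record):
            event = getattr(record, "event", None)
            if isinstance(event, dict) and event.get("type") == "pulp-retry":
                self.retry_count += 1
            # Always let the record through; this filter only observes.
            return True

    observer = RetryObserver()
    logging.getLogger("pubtools.pulplib").addFilter(observer)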
2 changes: 1 addition & 1 deletion pubtools/pulplib/_impl/client/client.py
@@ -54,7 +54,7 @@ class Client(object):
_REQUEST_THREADS = int(os.environ.get("PUBTOOLS_PULPLIB_REQUEST_THREADS", "4"))
_PAGE_SIZE = int(os.environ.get("PUBTOOLS_PULPLIB_PAGE_SIZE", "2000"))
_TASK_THROTTLE = int(os.environ.get("PUBTOOLS_PULPLIB_TASK_THROTTLE", "500"))
_RETRY_POLICY = lambda *_: retry.new_policy()
_RETRY_POLICY = lambda *_: retry.PulpRetryPolicy()

def __init__(self, url, **kwargs):
"""
68 changes: 63 additions & 5 deletions pubtools/pulplib/_impl/client/retry.py
@@ -1,7 +1,65 @@
from more_executors.retry import ExceptionRetryPolicy
import os
import logging
import traceback

from more_executors.retry import RetryPolicy, ExceptionRetryPolicy

def new_policy():
# TODO: we'll need our own policy with improved logging and handling of
# certain "no point in retrying" errors
return ExceptionRetryPolicy()
from .errors import TaskFailedException

LOG = logging.getLogger("pubtools.pulplib")

ATTEMPTS = int(os.environ.get("PUBTOOLS_PULPLIB_RETRY_ATTEMPTS", "10"))
SLEEP = float(os.environ.get("PUBTOOLS_PULPLIB_RETRY_SLEEP", "1.0"))
MAX_SLEEP = float(os.environ.get("PUBTOOLS_PULPLIB_RETRY_MAX_SLEEP", "120.0"))


class PulpRetryPolicy(RetryPolicy):
def __init__(self, max_attempts=ATTEMPTS, sleep=SLEEP, max_sleep=MAX_SLEEP):
super(PulpRetryPolicy, self).__init__()
self._max_attempts = max_attempts
self._delegate = ExceptionRetryPolicy(
max_attempts=max_attempts, sleep=sleep, max_sleep=max_sleep
)

def should_retry(self, attempt, future):
# This is still using only the default ExceptionRetryPolicy behavior.
# I expect at some point we will need to fine-tune this and look
# at the error in more detail in order to decide whether to retry.
retry = self._delegate.should_retry(attempt, future)

if future.exception() and retry:
self._log_retry(attempt, future)

return retry

def sleep_time(self, attempt, future):
return self._delegate.sleep_time(attempt, future)

def _log_retry(self, attempt, future):
# TODO: it would be nice if we could add some info here about
# *what* is being retried. But it's a bit of a challenge since
# future objects themselves don't come with any info on what was
# done to create the future.
exception = future.exception()
LOG.warning(
"Retrying due to error: %s [%d/%d]%s",
exception,
attempt,
self._max_attempts,
self._traceback(exception),
extra={"event": {"type": "pulp-retry"}},
)

def _traceback(self, exception):
out = ""

if isinstance(exception, TaskFailedException):
            # No point in logging a traceback for this exception; the message
            # "Task <id> failed" is already all the useful info
pass
elif hasattr(exception, "__traceback__"):
out = "\n" + "".join(
traceback.format_exception(None, exception, exception.__traceback__)
)

return out
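
The TODO in ``should_retry`` above anticipates fine-tuning based on the kind of
error encountered. As a hedged sketch of that idea (not part of this commit; the
404 rule and the ``SelectivePulpRetryPolicy`` name are assumptions for
illustration only), a subclass could inspect the exception before delegating:

.. code-block:: python

    import requests

    class SelectivePulpRetryPolicy(PulpRetryPolicy):
        """Hypothetical policy: treat HTTP 404 responses as non-retryable."""

        def should_retry(self, attempt, future):
            exception = future.exception()
            if isinstance(exception, requests.exceptions.HTTPError):
                response = exception.response
                if response is not None and response.status_code == 404:
                    # A missing resource will not appear by retrying.
                    return False
            return super(SelectivePulpRetryPolicy, self).should_retry(attempt, future)

Such a policy could be wired in the same way the tests do, by overriding
``_RETRY_POLICY`` on a ``Client`` subclass (see ``tests/conftest.py`` below).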
21 changes: 18 additions & 3 deletions tests/client/test_client.py
@@ -2,8 +2,6 @@
import requests_mock
from pubtools.pulplib import Client, Criteria, Repository, PulpException

from more_executors.retry import ExceptionRetryPolicy


def test_can_construct(requests_mocker):
client = Client("https://pulp.example.com/")
@@ -40,7 +38,7 @@ def test_can_search(client, requests_mocker):
assert requests_mocker.call_count == 1


def test_search_retries(client, requests_mocker):
def test_search_retries(client, requests_mocker, caplog):
requests_mocker.post(
"https://pulp.example.com/pulp/api/v2/repositories/search/",
[
@@ -62,6 +60,23 @@ def test_search_retries(client, requests_mocker):
# But would have needed several attempts
assert requests_mocker.call_count == 4

# And those retries should have been logged
messages = caplog.messages
assert len(messages) == 3

# Messages have full exception detail. Just check the first line.
lines = [m.splitlines()[0] for m in messages]

assert lines[0].startswith("Retrying due to error: 413")
assert lines[0].endswith(" [1/6]")
assert lines[1].startswith("Retrying due to error: 500")
assert lines[1].endswith(" [2/6]")
assert lines[2].startswith("Retrying due to error:")
assert lines[2].endswith(" [3/6]")

# Retry logs should have the pulp-retry event attached.
assert caplog.records[-1].event == {"type": "pulp-retry"}


def test_search_can_paginate(client, requests_mocker):
client._PAGE_SIZE = 10
9 changes: 5 additions & 4 deletions tests/conftest.py
@@ -1,9 +1,9 @@
from more_executors.retry import ExceptionRetryPolicy
import pytest
import requests_mock

from pubtools.pulplib import Client
from pubtools.pulplib._impl.client.poller import TaskPoller
from pubtools.pulplib._impl.client.retry import PulpRetryPolicy


@pytest.fixture
@@ -36,9 +36,10 @@ def fast_poller():
TaskPoller.DELAY = old_delay


def fast_retry_policy(_client):
return ExceptionRetryPolicy(max_attempts=6, max_sleep=0.001)
class FastRetryPolicy(PulpRetryPolicy):
def __init__(self):
super(FastRetryPolicy, self).__init__(max_attempts=6, max_sleep=0.001)


class FastRetryClient(Client):
_RETRY_POLICY = fast_retry_policy
_RETRY_POLICY = lambda *_: FastRetryPolicy()
8 changes: 7 additions & 1 deletion tests/repository/test_publish.py
@@ -168,7 +168,7 @@ def test_publish_fail(fast_poller, requests_mocker, client):
assert "Task task1 failed" in str(error)


def test_publish_retries(fast_poller, requests_mocker, client):
def test_publish_retries(fast_poller, requests_mocker, client, caplog):
"""publish retries distributors as they fail"""
repo = Repository(
id="some-repo",
@@ -220,3 +220,9 @@ def test_publish_retries(fast_poller, requests_mocker, client):
"cdn_distributor",
"cdn_distributor",
]

# The retry should have been logged
messages = caplog.messages
assert (
messages[0].splitlines()[0] == "Retrying due to error: Task task2 failed [1/6]"
)
