Reset restart failed count on update status (#29)

canonical · Mar 25, 2024 · 307dc62 · 307dc62
1 parent b0a012b
commit 307dc62
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 1 deletion.
diff --git a/src/charm.py b/src/charm.py
@@ -109,6 +109,11 @@ def _on_update_status(self, _: ops.UpdateStatusEvent) -> None:
             self.model.unit.status = ops.BlockedStatus("Waiting for relation.")
             return
 
+        # Reset the NRestarts counter of the service back to 0.
+        # We do it here because at this point we can be certain that
+        # the service is up and running
+        self.jenkins_agent_service.reset_failed_state()
+
         self.model.unit.status = ops.ActiveStatus()
 
 

diff --git a/src/service.py b/src/service.py
@@ -17,7 +17,7 @@
 
 logger = logging.getLogger(__name__)
 AGENT_SERVICE_NAME = "jenkins-agent"
-APT_PACKAGE_VERSION = "1.0.8"
+APT_PACKAGE_VERSION = "1.0.9"
 APT_PACKAGE_NAME = f"jenkins-agent-{APT_PACKAGE_VERSION}"
 SYSTEMD_SERVICE_CONF_DIR = "/etc/systemd/system/jenkins-agent.service.d/"
 PPA_URI = "https://ppa.launchpadcontent.net/canonical-is-devops/jenkins-agent-charm/ubuntu/"
@@ -166,6 +166,20 @@ def restart(self) -> None:
         if not self._startup_check():
             raise ServiceRestartError("Error waiting for the agent service to start")
 
+    def reset_failed_state(self) -> None:
+        """Reset the NRestarts count of the agent service back to 0.
+
+        The service keeps track of the 'restart-count' and blocks further restarts
+        if the maximum allowed is reached. This count is not reset when the service restarts
+        so it is cleared manually. Best effort: a SystemdError is logged, never raised.
+        """
+        try:
+            # Disable protected-access here because reset-failed is not implemented in the lib
+            systemd._systemctl("reset-failed", AGENT_SERVICE_NAME)  # pylint: disable=W0212
+        except systemd.SystemdError as exc:
+            # Not critical: record the error details in the log and carry on
+            logger.error("Failed to reset failed state: %s", exc)
+
     def reset(self) -> None:
         """Stop the agent service and clear its configuration file.
 

diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
@@ -10,6 +10,7 @@
 import ops
 import ops.testing
 import pytest
+from charms.operator_libs_linux.v1 import systemd
 
 import charm_state
 import service
@@ -173,6 +174,27 @@ def test_update_status_service_active(
     """
     harness.add_relation(charm_state.AGENT_RELATION, "jenkins-k8s")
     monkeypatch.setattr(service.JenkinsAgentService, "is_active", PropertyMock(return_value=True))
+    monkeypatch.setattr(systemd, "_systemctl", MagicMock(side_effect=systemd.SystemdError))
+
+    harness.begin()
+
+    harness.charm.on.update_status.emit()
+
+    assert harness.charm.unit.status.name == ops.ActiveStatus.name
+
+
+def test_update_status_reset_failed_state_systemd_error(
+    harness: ops.testing.Harness, monkeypatch: pytest.MonkeyPatch
+):
+    """
+    arrange: given a charm with relation to jenkins and the service is active.
+    act: when update-status hook is fired with reset-failed raising an error.
+    assert: The charm correctly ignores the error and sets the status to active.
+    """
+    harness.add_relation(charm_state.AGENT_RELATION, "jenkins-k8s")
+    monkeypatch.setattr(service.JenkinsAgentService, "is_active", PropertyMock(return_value=True))
+    monkeypatch.setattr(systemd, "_systemctl", MagicMock(side_effect=systemd.SystemdError))
+
     harness.begin()
 
     harness.charm.on.update_status.emit()