[wip][tosquash] review comments and minor changes

mutax · mutax · commit 7ff517d6c041 · 2025-02-21T11:11:36.000+01:00
To be squashed into the existing commits once we are happy with the PR.
diff --git a/networking_nsxv3/common/synchronization.py b/networking_nsxv3/common/synchronization.py
@@ -216,9 +216,9 @@ def _get(self):
 
 
 class JobRerunner():
-    """ Thread save data structure to reschedule jobs when it is already running
+    """ Thread-safe data structure to reschedule jobs when they are already running
 
-    When a job is retrieved from the active queue and running in a worker thread,
+    When a job is retrieved from the active queue and already running in a worker thread,
     there is a chance that another job for the same object is added to the active
     queue and also started in a worker. While there then is some locking happening
     that should prevent race conditions, this still blocks the worker thread(s),
@@ -248,18 +248,21 @@ class JobRerunner():
     to pick up changes quickly.
     """
 
+    _lname_running = 'JobRerunner-running'
+    _lname_torerun = 'JobRerunner-torerun'
+
     def __init__(self):
-        self._ready = collections.deque()
         self._running = dict()
+        self._to_rerun = collections.deque()
 
-    def pop(self) -> Runnable:
+    def get_rerunnable(self) -> Runnable:
         # Let's also use the LockManager that is used for locking
         # in the code already. A simpler lock might do fine, but
         # for the LockManager we know it is working.
 
-        with LockManager.get_lock("JobRerunner-deque"):
+        with LockManager.get_lock(self._lname_torerun):
             try:
-                job = self._ready.popleft()
+                job = self._to_rerun.popleft()
                 LOG.debug("JobRerunner had rerunnable job: %s", job)
                 LOG.info("JobRerunner (rerun) %s", job.get_statline())
             except IndexError:
@@ -270,7 +273,7 @@ def pop(self) -> Runnable:
 
     def job_done(self, job: Runnable):
         LOG.debug("JobRerunner job_done called for %s", job)
-        with LockManager.get_lock("JobRerunner-running"):
+        with LockManager.get_lock(self._lname_running):
             count = self._running.get(job, 0)
 
             if count == 1:
@@ -286,8 +289,8 @@ def job_done(self, job: Runnable):
                 LOG.info("JobRerunner (requeue) %s", job.get_statline())
                 LOG.debug("JobRerunner job %s is done, %d reruns requested, marking it for re-execution", job, count)
                 del self._running[job]
-                with LockManager.get_lock("JobRerunner-deque"):
-                    self._ready.append(job)
+                with LockManager.get_lock(self._lname_torerun):
+                    self._to_rerun.append(job)
             else:
                 # prevent the error from spreading
                 del self._running[job]
@@ -302,7 +305,7 @@ def add_job(self, job: Runnable) -> bool:
         returns False if the job is already running and was marked for re-execution
 
         """
-        with LockManager.get_lock("JobRerunner-running"):
+        with LockManager.get_lock(self._lname_running):
             count = self._running.get(job, 0)
             if count <=0:
                 # no job running, we can run the job
@@ -323,7 +326,7 @@ def add_job(self, job: Runnable) -> bool:
         # let's log these as info for debugging, they should be sufficient in prod
         # to find issues with the JobRerunner:
         LOG.info("JobRerunner stat: %d jobs waiting, total submission count: %d", len(self._running), sum)
-        LOG.info("JobRerunner stat: %d jobs ready for re-execution", len(self._ready))
+        LOG.info("JobRerunner stat: %d jobs ready for re-execution", len(self._to_rerun))
         return False
 
 
@@ -386,11 +389,11 @@ def _start(self):
                 if self.active() < self._idle and self.passive() > 0:
                     self._active.put_nowait(self._passive.get_nowait())
                     self._passive.task_done()
-                job = self._rerunner.pop()
-                from_queue = False
+                pulled_from_queue = False
+                job = self._rerunner.get_rerunnable()
                 if not job:
                     job = self._active.get(block=True, timeout=TIMEOUT)
-                    from_queue = True
+                    pulled_from_queue = True
 
                 # check if we are allowed to run it,
                 # if yes mark it as running and spawn it
@@ -408,7 +411,7 @@ def wrap(rerun, ajob):
 
                     self._workers.spawn(wrap, self._rerunner, job)
 
-                if from_queue:
+                if pulled_from_queue:
                     self._active.task_done()
             except eventlet.queue.Empty:
                 LOG.info("No activity for the last {} seconds.".format(TIMEOUT))
diff --git a/networking_nsxv3/tests/unit/realization/test_coordination.py b/networking_nsxv3/tests/unit/realization/test_coordination.py
@@ -40,7 +40,7 @@ def nop(item):
             rerunner.job_done(job)
 
         # no job was added twice, so no job should be re-executed
-        self.assertEqual(rerunner.pop(), None)
+        self.assertEqual(rerunner.get_rerunnable(), None)
 
     def test_rerunner_rerun(self):
         from networking_nsxv3.common.synchronization import JobRerunner, Runnable
@@ -66,14 +66,14 @@ def nop(item):
             self.assertFalse(ret)
 
         # mark them all done, then expect them
-        # all to be returned once in pop,
+        # all to be returned once in get_rerunnable,
         # although added twice again above
         for some_id in range(100):
             job = Runnable(str(some_id), nop)
             rerunner.job_done(job)
 
         alljobs = []
-        while job := rerunner.pop():
+        while job := rerunner.get_rerunnable():
             alljobs.append(job)
 
         # each job should be returned once