sapcc
diff --git a/‎networking_nsxv3/common/synchronization.py
Lines changed: 213 additions & 49 deletions b/‎networking_nsxv3/common/synchronization.py
Lines changed: 213 additions & 49 deletions
@@ -1,6 +1,8 @@
 """
 Synchronization - classes related concurrent execution scheduling and limits
 """
+from typing import Callable, Union, Optional
+
 import eventlet
 eventlet.monkey_patch()
 
@@ -66,12 +68,40 @@ def retry_next(self):
         return False
 
 
+CBPARAMS = Union[str, dict]
+
+
 class Runnable(object):
 
-    def __init__(self, idn, fn, priority=Priority.LOWEST):
+    def __init__(self, fnparams: CBPARAMS, fn: Callable[[CBPARAMS], None], priority=Priority.LOWEST):
         self.priority = priority
-        self.idn = idn
-        self.fn = fn
+
+        # __eq__ compares these two attributes, so they must never be changed after construction:
+        self._fnparams = fnparams
+        self._fn = fn
+
+        # Contrary to the code comments, we sometimes get a dictionary
+        # as parameter for the callback. Some data from that
+        # dictionary is then actually used, e.g. revision and resource id.
+        # Fixing that would require major refactoring of the rpc and
+        # realizer objects. Not going to happen.
+        #
+        # Instead, in these cases we will still use the openstack id and the
+        # name of the callback function to prevent parallel running of jobs.
+        # But when a job is submitted to the Rerunner, it will keep track of the
+        # dictionary contents if one exists and keep these for rerunning. So essentially
+        # we create subjobs.
+        # Realistically we need a better data structure than the active queue and Rerunner.
+
+        # fnparams is a str for most of the callbacks:
+        if isinstance(fnparams, str):
+            self.idn = fnparams
+        elif isinstance(fnparams, dict):
+            self.idn = fnparams['id']
+        else:
+            self.idn = str(fnparams)
+            LOG.warning('unexpected type %s for job parameters of %s', type(fnparams), fn.__name__)
+
         self._runcount = 0
         self._created = time.time()
         self._scheduled = None
@@ -81,7 +111,10 @@ def __init__(self, idn, fn, priority=Priority.LOWEST):
 
     @property
     def identifier(self) -> tuple:
-        return (self.idn, self.fn.__name__)
+        return (self.idn, self._fn.__name__)
+
+    def debugid(self) -> str:
+        """Return a verbose identifier for log messages.
+
+        In addition to the openstack id and the callback name it contains
+        the string form of the full callback parameters.
+        """
+        details = (self.idn, self._fn.__name__, str(self._fnparams))
+        return str(details)
 
     def set_scheduled(self):
         """ called when we submit the job to the worker pool """
@@ -90,7 +123,7 @@ def set_scheduled(self):
         self._jobdone = None
         self._rescheduled = None
 
-    def set_start(self):
+    def _set_start(self):
         """ called in our wrapper when we actually start fn """
         # we need to reset the other timings
         # because we use the same job for rerunning
@@ -99,7 +132,7 @@ def set_start(self):
         self._rescheduled = None
         self._runcount += 1
 
-    def set_done(self):
+    def _set_done(self):
         """ called in our wrapper when fn returns """
         self._jobdone = time.time()
         self._rescheduled = None
@@ -132,6 +165,13 @@ def get_statline(self) -> str:
         return (f"timings for job {self} - runcount: {self._runcount} age: {age} "
                 f"scheduled: {scheduled} started: {started} runtime: {runtime} rescheduled: {rescheduled}")
 
+    def execute(self):
+        """Invoke the callback with its stored parameters and record timings.
+
+        The start is recorded right before the callback is invoked. The end
+        is recorded in a ``finally`` clause, so it is also recorded when the
+        callback raises; the exception is not caught here and propagates to
+        the caller.
+        """
+        self._set_start()
+        try:
+            self._fn(self._fnparams)
+        finally:
+            self._set_done()
+
     def __repr__(self):
         # lets not just use the object id, maybe
         return str(self.identifier)
@@ -141,9 +181,11 @@ def __eq__(self, other):
         Note, the priority is not part of the comparison
         Thus if a runnable with higher priority is about to be
         added to the queue it will be rejected silently.
+        To prevent starvation, the queue will update the priority of the
+        existing element, in case it was lower than the item that was about to be added.
         """
         if isinstance(other, Runnable):
-            return (self.idn == other.idn and self.fn == other.fn)
+            return (self._fnparams  == other._fnparams  and self._fn == other._fn)
         else:
             return False
 
@@ -162,11 +204,6 @@ def __lt__(self, other):
 
         return self.priority < other.priority
 
-    def __hash__(self):
-        # with the original repr this is broken, as __eq__ takes the fn into
-        # account as well!
-        return hash((self.idn, self.fn))
-
 
 class UniqFiFoQueue(eventlet.queue.Queue):
     """
@@ -238,14 +275,147 @@ def _get(self):
         return heapq.heappop(self.queue)
 
 
+class JobList():
+    """
+        We are keeping track of the jobs and their parameters for the
+        JobRerunner, based on the type of job and openstack id.
+        There can be multiple similar jobs (same id, same method) but
+        with different parameters, if the parameter is a dict.
+        In these cases this JobList will keep track of the jobs, because
+        only a job with the same additional parameters will compare equal.
+
+        add:
+
+        When a job is added the _runnables list can be empty, then the job
+        can run and we store it here for reference, with count 1.
+        If the list is not empty the job can either already exist or it
+        can be a job with different additional parameters.
+
+        If the job already exists, we increase the counter, and do not allow it to run.
+        If the job does not exist, we add it to the list with count = 1 and allow it to run.
+
+        done:
+
+        When a job is done, we will look at our list, and decrease the counter.
+        If the counter is 0, the job was not submitted a second time, and we can remove
+        it from the list.
+        If the counter is not 0, the job was requested to run again, and we keep it in the
+        list with the updated counter.
+
+        done will then choose a job from the list, that is supposed to run again,
+        remove it from the list and return it to the JobRerunner.
+
+        This might be the same job that was just finished or it could be a different one.
+        For now we choose the oldest entry. Entries are only ever appended, so the
+        oldest entry is the first one in the list. We might want to change that, so the
+        choice is isolated in a helper function for now in the POC.
+
+    """
+    def __init__(self):
+        # identifier shared by all jobs in this list (Runnable.identifier is a
+        # tuple of openstack id and callback name); None until the first add()
+        self._job_identifier: Optional[tuple] = None
+        # (request count, job) pairs in insertion order, oldest entry first
+        self._runnables: list[tuple[int, Runnable]] = []
+
+    def __len__(self):
+        """ number of distinct jobs (distinct parameters) in the list """
+        return len(self._runnables)
+
+    @property
+    def size(self):
+        """ sum of the request counters of all jobs in the list """
+        return sum(count for count, _ in self._runnables)
+
+    def __repr__(self):
+        return f"Joblist: {self._job_identifier}, len={len(self)}, {self._runnables}"
+
+    def add(self, job: Runnable) -> bool:
+        """ Register a request to run job.
+
+            Returns True if no equal job is in the list, the job is then stored
+            with count 1. Returns False if an equal job is already in the list,
+            only its counter is increased then.
+            Raises ValueError if the identifier of job differs from the identifier
+            of the jobs added before.
+        """
+        if self._job_identifier is None:
+            # the first job defines which identifier this list accepts
+            self._job_identifier = job.identifier
+        elif job.identifier != self._job_identifier:
+            raise ValueError("Can only add jobs of same type to a JobList")
+
+        # search through our list and update the counter or append the job:
+        for index, (count, existing_job) in enumerate(self._runnables):
+            if job == existing_job:
+                self._runnables[index] = (count + 1, existing_job)
+                return False
+
+        # this is the first of its kind, we can run it.
+        # note that a job that gets re-executed will be removed from
+        # the list (with the others of the same kind still present)
+        # so when it returns it will be the only one of its kind and
+        # can run. after it is finished a different one will be returned by done.
+        self._runnables.append((1, job))
+        return True
+
+    def _runnable_is_done(self, job: Runnable):
+        """ decrease the counter of job and remove its entry if the counter drops to 0,
+            raises KeyError if no equal job is in the list.
+        """
+        # search through our list and update the counter or remove the job:
+        for index, (count, existing_job) in enumerate(self._runnables):
+            if job != existing_job:
+                continue
+            LOG.debug("Job %s is done, updating JobList, request count was: %d", job.debugid(), count)
+            count -= 1
+            if count > 0:
+                # the job was requested again while it was running, keep it
+                self._runnables[index] = (count, existing_job)
+                return
+            # we do not need this job with this parameters again, it is done,
+            # so we remove it from the list.
+            # Note: the list might not be empty!
+            if count < 0:
+                LOG.warning("Job count in JobList was %d for %s", count, job.debugid())
+            del self._runnables[index]
+            return
+
+        # we should never mark a job done, that was not added in the first place,
+        # if its not in the list, something is wrong
+        raise KeyError(f'No such job {job}')
+
+    def _runnable_pop_next(self) -> Optional[Runnable]:
+        """ find the next job to run and remove it from the list,
+            or return None if there is None to run.
+        """
+        if not self._runnables:
+            return None
+        # no job should currently be running, because we are in the "done" part of the
+        # workflow. So we can choose any job we like, remove it from the list and run it.
+        # when that job returns and needs no re-execution it will be removed from the list,
+        # and we will not pop it here again, so the next one in line will be returned.
+        # NOTE(review): add() also allows a job with different parameters to run
+        # while another one is still tracked here, so the assumption above may
+        # not always hold -- please confirm.
+        # we always append to our list, so the first entry is the oldest one.
+        # pop(0) is required for that, a plain pop() would return the newest entry.
+        count, job = self._runnables.pop(0)
+        LOG.debug("found job to run next with %d rerun requests open: %s", count, job)
+        return job
+
+    def done(self, job: Runnable) -> Optional[Runnable]:
+        """ Mark job as finished and return the next job to run, if any.
+
+            Raises KeyError if nothing was ever added to this list or if job
+            is not in the list, and ValueError if the identifier of job does
+            not match the identifier of this list.
+        """
+        if self._job_identifier is None:
+            raise KeyError("JobList is empty")
+        if job.identifier != self._job_identifier:
+            raise ValueError("Can only remove jobs of same type from a JobList")
+
+        self._runnable_is_done(job)
+        return self._runnable_pop_next()
+
+    def get_count(self, job):
+        """ return the request counter of job, or 0 if it is not in the list.
+
+            Raises ValueError if the identifier of job does not match the
+            identifier of this list.
+        """
+        if self._job_identifier is not None:
+            if job.identifier != self._job_identifier:
+                raise ValueError("Can only count jobs of same type in a JobList")
+
+        for count, existing_job in self._runnables:
+            if job == existing_job:
+                return count
+
+        return 0
+
+
 class JobRerunner():
     """ Thread save data structure to reschedule jobs when they are already running
 
     When a job is retrieved from the active queue and already running in a worker thread,
     there is a chance that another job for the same object is added to the active
-    queue and also started in a worker. While there then is some locking happening
-    that should prevent race conditions, this still blocks the worker thread(s),
-    which degrades performance and re-executes the job unnecessarily often.
+    queue and also started in a worker. To prevent race conditions, the worker threads
+    will use a lock to prevent two jobs running on the same objects, but this leads to
+    blocking of each of the affected workers, which degrades performance and also re-executes
+    the jobs unnecessarily often.
 
     To prevent this, we use this JobRerunner:
 
@@ -275,8 +445,8 @@ class JobRerunner():
     _lname_torerun = 'JobRerunner-torerun'
 
     def __init__(self):
-        self._running = dict()
-        self._to_rerun = collections.deque()
+        self._running: collections.defaultdict[(int, str),JobList] = collections.defaultdict(JobList)
+        self._to_rerun: collections.deque[Runnable] = collections.deque()
 
     def get_rerunnable(self) -> Runnable:
         # Let's also use the LockManager that is used for locking
@@ -296,26 +466,19 @@ def get_rerunnable(self) -> Runnable:
 
     def job_done(self, job: Runnable):
         LOG.debug("JobRerunner job_done called for %s", job)
+
         with LockManager.get_lock(self._lname_running):
-            count = self._running.get(job, 0)
 
-            if count == 1:
-                del self._running[job]
+            joblist: JobList = self._running[job.identifier]
+            next_job = joblist.done(job)
+            if not next_job:
+                del self._running[job.identifier]
                 LOG.info("JobRerunner (done, no reruns requested) %s", job.get_statline())
-            elif count > 1:
-                # we only allow exactly one job to run at a time,
-                # all jobs arriving later will increase the counter while
-                # the job is still running or they get re-queued.
-                # if a job is in the ready deque it will at some point
-                # re-appear and so we can forget about the counter.
-                LOG.info("JobRerunner (done, %d reruns requested) %s", count - 1, job.get_statline())
-                del self._running[job]
-                with LockManager.get_lock(self._lname_torerun):
-                    self._to_rerun.append(job)
             else:
-                # prevent the error from spreading
-                del self._running[job]
-                LOG.warning("JobRerunner job_done called too often for job %s", job)
+                # we got a job to rerun from our helper
+                LOG.info("JobRerunner (done, got rerun) done: %s next: %s", job.get_statline(), next_job.get_statline())
+                with LockManager.get_lock(self._lname_torerun):
+                    self._to_rerun.append(next_job)
 
     def add_job(self, job: Runnable) -> bool:
         """ Add job to list of jobs running/to be started or mark it for re-execution
@@ -327,22 +490,20 @@ def add_job(self, job: Runnable) -> bool:
 
         """
         with LockManager.get_lock(self._lname_running):
-            count = self._running.get(job, 0)
-            if count <= 0:
+            joblist:JobList = self._running[job.identifier]
+
+            if joblist.add(job):
                 # no job running, we can run the job
-                # if the counter is < 0 our accounting is wrong,
-                # so we fix it and run the job.
-                self._running[job] = 1
                 LOG.debug("JobRerunner no identical job is currently running, can start %s", job)
                 return True
-
-            self._running[job] = count + 1
-            LOG.debug("JobRerunner job %s already running, marked for rescheduling, count: %d ", job, count)
+            else:
+                count = joblist.get_count(job)
+                LOG.debug("JobRerunner job %s already running, marked for rescheduling, count: %d ", job, count)
 
             sum = 0
-            for job, scount in self._running.items():
-                sum += scount
-                LOG.debug("JobRerunner stat: job %s is running, submission count: %d", job, scount)
+            for identifier, joblist in self._running.items():
+                sum += joblist.size
+                LOG.debug("JobRerunner stat: job %s is running, submission count: %d", identifier, joblist.size)
 
         # let's log these as info for debugging, they should be sufficient in prod
         # to find issues with the JobRerunner:
@@ -379,9 +540,14 @@ def __init__(self, active_size=INFINITY, passive_size=INFINITY,
     def run(self, priority, ids, fn):
         """ Submit a job with priority
 
+        Note: the second parameter apparently sometimes is a dictionary, in contrast
+              to the documentation in the code!
+              Fixing this would require too much refactoring at the moment -- mutax
+
         Keyword arguments:
         priority -- job priority of type Priority.class
-        ids -- list of IDs (identifiers) that will be passed to the 'fn'
+        ids -- list of OpenStack-IDs (identifiers) that will be passed to the 'fn'
+               OR list of dictionaries(!) of OpenStack objects (containing their id)
         fn -- a function about to be executed by the runner with an argument ID
         """
         if self._state != "started":
@@ -419,15 +585,13 @@ def _start(self):
                 # check if we are allowed to run it,
                 # if yes mark it as running and spawn it
                 if self._rerunner.add_job(job):
-                    LOG.info(MESSAGE.format("Processing", job.idn, Priority(job.priority).name, job.fn.__name__))
+                    LOG.info(MESSAGE.format("Processing", job.idn, Priority(job.priority).name, job))
 
                     # ideally we would be able to add a callback to the
                     # greenthread, but this is hidden in the pool, so
                     # let's wrap the function once more.
                     def wrap(rerun, ajob):
-                        ajob.set_start()
-                        ajob.fn(ajob.idn)
-                        ajob.set_done()
+                        ajob.execute()
                         rerun.job_done(ajob)
 
                     job.set_scheduled()