@@ -114,7 +114,7 @@ def get_statline(self) -> str:
114
114
115
115
scheduled = '-'
116
116
started = '-'
117
- jobdone = '-'
117
+ runtime = '-'
118
118
rescheduled = '-'
119
119
120
120
if self ._scheduled and self ._created :
@@ -124,13 +124,13 @@ def get_statline(self) -> str:
124
124
started = f"{ self ._started - self ._scheduled :0.4f} "
125
125
126
126
if self ._jobdone and self ._started :
127
- jobdone = f"{ self ._jobdone - self ._started :0.4f} "
127
+ runtime = f"{ self ._jobdone - self ._started :0.4f} "
128
128
129
129
if self ._rescheduled and self ._jobdone :
130
130
rescheduled = f"{ self ._rescheduled - self ._jobdone :0.4f} "
131
131
132
- return (f"jobtimings: { self . identifier } : runcount: { self ._runcount } age: { age } "
133
- f"scheduled: { scheduled } started: { started } runtime: { jobdone } rescheduled: { rescheduled } " )
132
+ return (f"timings for job { self } - runcount: { self ._runcount } age: { age } "
133
+ f"scheduled: { scheduled } started: { started } runtime: { runtime } rescheduled: { rescheduled } " )
134
134
135
135
def __repr__ (self ):
136
136
# let's not just use the object id, maybe
@@ -155,6 +155,11 @@ def __lt__(self, other):
155
155
Only the passive queue is ordered by priority.
156
156
The active queue is FIFO.
157
157
"""
158
+ # if the priority is equal, we want to order
159
+ # by creation time to handle oldest jobs first
160
+ if self .priority == other .priority :
161
+ return self ._created < other ._created
162
+
158
163
return self .priority < other .priority
159
164
160
165
def __hash__ (self ):
@@ -282,8 +287,7 @@ def get_rerunnable(self) -> Runnable:
282
287
try :
283
288
job = self ._to_rerun .popleft ()
284
289
job .set_rescheduled ()
285
- LOG .debug ("JobRerunner had rerunnable job: %s" , job )
286
- LOG .info ("JobRerunner (rerun) %s" , job .get_statline ())
290
+ LOG .info ("JobRerunner (about to rerun) %s" , job .get_statline ())
287
291
except IndexError :
288
292
job = None
289
293
LOG .debug ("JobRerunner had no rerunnable job" )
@@ -297,16 +301,14 @@ def job_done(self, job: Runnable):
297
301
298
302
if count == 1 :
299
303
del self ._running [job ]
300
- LOG .debug ("JobRerunner job %s is done, no reruns requested" , job )
301
- LOG .info ("JobRerunner (done) %s" , job .get_statline ())
304
+ LOG .info ("JobRerunner (done, no reruns requested) %s" , job .get_statline ())
302
305
elif count > 1 :
303
306
# we only allow exactly one job to run at a time,
304
307
# all jobs arriving later will increase the counter while
305
308
# the job is still running or they get re-queued.
306
309
# if a job is in the ready deque it will at some point
307
310
# re-appear and so we can forget about the counter.
308
- LOG .info ("JobRerunner (requeue) %s" , job .get_statline ())
309
- LOG .debug ("JobRerunner job %s is done, %d reruns requested, marking it for re-execution" , job , count )
311
+ LOG .info ("JobRerunner (done, %d reruns requested) %s" , count - 1 , job .get_statline ())
310
312
del self ._running [job ]
311
313
with LockManager .get_lock (self ._lname_torerun ):
312
314
self ._to_rerun .append (job )
@@ -344,8 +346,8 @@ def add_job(self, job: Runnable) -> bool:
344
346
345
347
# let's log these as info for debugging, they should be sufficient in prod
346
348
# to find issues with the JobRerunner:
347
- LOG .info ("JobRerunner stat: %d jobs waiting , total submission count: %d" , len ( self . _running ), sum )
348
- LOG . info ( "JobRerunner stat: %d jobs ready for re-execution" , len (self ._to_rerun ))
349
+ LOG .info ("JobRerunner stat: %d jobs tracked , total submission count: %d, ready for re-exection: %d" ,
350
+ len ( self . _running ), sum , len (self ._to_rerun ))
349
351
return False
350
352
351
353
0 commit comments