Skip to content

Commit 4958eac

Browse files
committed
[IMP] runbot: global state computation
One of the most problematic concurrency issue is when multiple children tries to write the state/result on the parent concurrently. Multiple partial solutions are possible here: - avoid to write if the state doesn't changes - avoid to write on a build belonging to another host A maybe overkill solution would be to add a message queue for an host, signaling that one of the child state changed. An intermediate solution would be to let the host check the state of the children while there are some of them, and update the local build state assynchronously himself. We can actualy use the 'waiting' global state to now if we need to continue to check the build state and result. While a build is not done or running, we need to check all children result and state on case they were updated. One corner case is when rebuilding a child: a new child is added but the parent is maybe not in the 'waiting' global state anymore. If this is the case, we need to recusivelly change the state of the parents to waiting so that they will update again.
1 parent c3d24f3 commit 4958eac

File tree

4 files changed

+98
-73
lines changed

4 files changed

+98
-73
lines changed

runbot/models/build.py

Lines changed: 83 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,9 @@ class BuildResult(models.Model):
163163
trigger_id = fields.Many2one('runbot.trigger', related='params_id.trigger_id', store=True, index=True)
164164

165165
# state machine
166-
global_state = fields.Selection(make_selection(state_order), string='Status', compute='_compute_global_state', store=True, recursive=True)
166+
global_state = fields.Selection(make_selection(state_order), string='Status', default='pending', required=True)
167167
local_state = fields.Selection(make_selection(state_order), string='Build Status', default='pending', required=True, index=True)
168-
global_result = fields.Selection(make_selection(result_order), string='Result', compute='_compute_global_result', store=True, recursive=True)
168+
global_result = fields.Selection(make_selection(result_order), string='Result', default='ok')
169169
local_result = fields.Selection(make_selection(result_order), string='Build Result', default='ok')
170170

171171
requested_action = fields.Selection([('wake_up', 'To wake up'), ('deathrow', 'To kill')], string='Action requested', index=True)
@@ -254,20 +254,6 @@ def _compute_log_list(self): # storing this field because it will be access trh
254254
build.log_list = ','.join({step.name for step in build.params_id.config_id.step_ids() if step._has_log()})
255255
# TODO replace logic, add log file to list when executed (avoid 404, link log on docker start, avoid fake is_docker_step)
256256

257-
@api.depends('children_ids.global_state', 'local_state')
258-
def _compute_global_state(self):
259-
for record in self:
260-
waiting_score = record._get_state_score('waiting')
261-
children_ids = [child for child in record.children_ids if not child.orphan_result]
262-
if record._get_state_score(record.local_state) > waiting_score and children_ids: # if finish, check children
263-
children_state = record._get_youngest_state([child.global_state for child in children_ids])
264-
if record._get_state_score(children_state) > waiting_score:
265-
record.global_state = record.local_state
266-
else:
267-
record.global_state = 'waiting'
268-
else:
269-
record.global_state = record.local_state
270-
271257
@api.depends('gc_delay', 'job_end')
272258
def _compute_gc_date(self):
273259
icp = self.env['ir.config_parameter'].sudo()
@@ -299,22 +285,6 @@ def _get_youngest_state(self, states):
299285
def _get_state_score(self, result):
300286
return state_order.index(result)
301287

302-
@api.depends('children_ids.global_result', 'local_result', 'children_ids.orphan_result')
303-
def _compute_global_result(self):
304-
for record in self:
305-
if record.local_result and record._get_result_score(record.local_result) >= record._get_result_score('ko'):
306-
record.global_result = record.local_result
307-
else:
308-
children_ids = [child for child in record.children_ids if not child.orphan_result]
309-
if children_ids:
310-
children_result = record._get_worst_result([child.global_result for child in children_ids], max_res='ko')
311-
if record.local_result:
312-
record.global_result = record._get_worst_result([record.local_result, children_result])
313-
else:
314-
record.global_result = children_result
315-
else:
316-
record.global_result = record.local_result
317-
318288
def _get_worst_result(self, results, max_res=False):
319289
results = [result for result in results if result] # filter Falsy values
320290
index = max([self._get_result_score(result) for result in results]) if results else 0
@@ -340,37 +310,50 @@ def copy_data(self, default=None):
340310
})
341311
return [values]
342312

313+
@api.model_create_multi
314+
def create(self, vals_list):
315+
for values in vals_list:
316+
if 'local_state' in values:
317+
values['global_state'] = values['local_state']
318+
if 'local_result' in values:
319+
values['global_result'] = values['local_result']
320+
records = super().create(vals_list)
321+
if records.parent_id:
322+
records.parent_id._update_globals()
323+
return records
324+
343325
def write(self, values):
344326
# some validation to ensure db consistency
345327
if 'local_state' in values:
346328
if values['local_state'] == 'done':
347329
self.filtered(lambda b: b.local_state != 'done').commit_export_ids.unlink()
348330

349-
local_result = values.get('local_result')
350-
for build in self:
351-
if local_result and local_result != self._get_worst_result([build.local_result, local_result]): # dont write ok on a warn/error build
352-
if len(self) == 1:
353-
values.pop('local_result')
354-
else:
355-
raise ValidationError('Local result cannot be set to a less critical level')
356-
357-
init_global_results = self.mapped('global_result')
358-
init_global_states = self.mapped('global_state')
359-
res = super(BuildResult, self).write(values)
360-
for init_global_result, build in zip(init_global_results, self):
361-
if init_global_result != build.global_result:
362-
build._github_status()
363-
364-
for init_global_state, build in zip(init_global_states, self):
365-
if not build.parent_id and init_global_state not in ('done', 'running') and build.global_state in ('done', 'running'):
366-
build._github_status()
367-
368-
if values.get('global_state') in ('done', 'running'):
369-
for build in self:
370-
if not build.parent_id and build.global_state not in ('done', 'running'):
371-
build._github_status()
372-
373-
return res
331+
# don't update if the value doesn't change to avoid triggering concurrent updates
332+
def minimal_update(records, field_name):
333+
updated = self.browse()
334+
if field_name in values:
335+
field_result = values.pop(field_name)
336+
updated = records.filtered(lambda b: (b[field_name] != field_result))
337+
if updated:
338+
super(BuildResult, updated).write({field_name: field_result})
339+
return updated
340+
341+
# local result is a special case since we don't only want to avoid an update if the value didn't change, but also if the value is less than the previous one
342+
# example: don't write 'ok' if result is 'ko' or 'warn'
343+
updated = self.browse()
344+
if 'local_result' in values:
345+
to_update = self.filtered(lambda b: (self._get_result_score(values['local_result']) > self._get_result_score(b.local_result)))
346+
updated = minimal_update(to_update, 'local_result')
347+
updated |= minimal_update(self, 'local_state')
348+
updated._update_globals()
349+
parents_to_update = minimal_update(self, 'global_result').parent_id
350+
parents_to_update |= minimal_update(self, 'global_state').parent_id
351+
parents_to_update |= minimal_update(self, 'orphan_result').parent_id
352+
parents_to_update._notify_global_update()
353+
354+
if values:
355+
super().write(values)
356+
return True
374357

375358
def _add_child(self, param_values, orphan=False, description=False, additionnal_commit_links=False):
376359

@@ -476,8 +459,6 @@ def _rebuild(self, message=None):
476459
self.orphan_result = True
477460

478461
new_build = self.create(values)
479-
if self.parent_id:
480-
new_build._github_status()
481462
user = request.env.user if request else self.env.user
482463
new_build._log('rebuild', 'Rebuild initiated by %s%s' % (user.name, (' :%s' % message) if message else ''))
483464

@@ -669,14 +650,56 @@ def _process_requested_actions(self):
669650
run_step = step_ids[-1]
670651
else:
671652
run_step = self.env.ref('runbot.runbot_build_config_step_run')
672-
run_step._run_step(build, log_path, force=True)
653+
run_step._run_step(build, log_path, force=True)()
673654
# reload_nginx will be triggered by _run_run_odoo
674655
except Exception:
675656
_logger.exception('Failed to wake up build %s', build.dest)
676657
build._log('_schedule', 'Failed waking up build', level='ERROR')
677658
build.write({'requested_action': False, 'local_state': 'done'})
678659
continue
679660

661+
def _notify_global_update(self):
662+
for record in self:
663+
if not record.host_id:
664+
record._update_globals()
665+
else:
666+
self.env['runbot.host.message'].create({
667+
'host_id': record.host_id.id,
668+
'build_id': record.id,
669+
'message': 'global_updated',
670+
})
671+
672+
def _update_globals(self):
673+
for record in self:
674+
children = record.children_ids.filtered(lambda child: not child.orphan_result)
675+
global_result = record.local_result
676+
if children:
677+
child_result = record._get_worst_result(children.mapped('global_result'), max_res='ko')
678+
global_result = record._get_worst_result([record.local_result, child_result])
679+
if global_result != record.global_result:
680+
record.global_result = global_result
681+
if not record.parent_id:
682+
record._github_status() # failfast
683+
684+
init_state = record.global_state
685+
testing_children = any(child.global_state not in ('running', 'done') for child in children)
686+
global_state = record.local_state
687+
if testing_children:
688+
child_state = 'waiting'
689+
global_state = record._get_youngest_state([record.local_state, child_state])
690+
691+
if global_state != record.global_state:
692+
record.global_state = global_state
693+
694+
ending_build = init_state not in ('done', 'running') and record.global_state in ('done', 'running')
695+
696+
if ending_build:
697+
if not record.local_result: # Set 'ok' result if no result set (no tests job on build)
698+
record.local_result = 'ok'
699+
record.build_end = now()
700+
if not record.parent_id:
701+
record._github_status()
702+
680703
def _schedule(self):
681704
"""schedule the build"""
682705
icp = self.env['ir.config_parameter'].sudo()
@@ -959,7 +982,6 @@ def _kill(self, result=None):
959982
v['local_result'] = result
960983
build.write(v)
961984
self.env.cr.commit()
962-
build._github_status()
963985
self.invalidate_cache()
964986

965987
def _ask_kill(self, lock=True, message=None):

runbot/models/host.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,12 @@ class MessageQueue(models.Model):
287287

288288
def _process(self):
289289
records = self
290-
# todo consume messages here
290+
global_updates = records.filtered(lambda r: r.message == 'global_updated')
291+
global_updates.build_id._update_globals()
292+
records -= global_updates
293+
# ask_kills = records.filtered(lambda r: r.message == 'ask_kill')
294+
# ask_kills.build_id._ask_kill()
295+
# records -= ask_kills
291296
if records:
292297
for record in records:
293298
self.env['runbot.runbot'].warning(f'Host {record.host_id.name} got an unexpected message {record.message}')

runbot/models/runbot.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,6 @@ def _fetch_loop_turn(self, host, pull_info_failures, default_sleep=1):
275275
_logger.warning('Removing %s from pull_info_failures', pr_number)
276276
del pull_info_failures[pr_number]
277277

278-
279278
return manager.get('sleep', default_sleep)
280279

281280
def _scheduler_loop_turn(self, host, sleep=5):

runbot/tests/test_build.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,9 @@ def test_base_fields(self):
173173

174174
# test a bulk write, that one cannot change from 'ko' to 'ok'
175175
builds = self.Build.browse([build.id, other.id])
176-
with self.assertRaises(ValidationError):
177-
builds.write({'local_result': 'warn'})
178-
# self.assertEqual(build.local_result, 'warn')
179-
# self.assertEqual(other.local_result, 'ko')
176+
builds.write({'local_result': 'warn'})
177+
self.assertEqual(build.local_result, 'warn')
178+
self.assertEqual(other.local_result, 'ko')
180179

181180

182181
def test_markdown_description(self):
@@ -385,16 +384,16 @@ def test_children(self):
385384
with self.assertQueries([]): # no change should be triggered
386385
build1_2.local_state = "testing"
387386

388-
# with self.assertQueries(['''UPDATE "runbot_build" SET "global_state"=%s,"local_state"=%s,"write_date"=%s,"write_uid"=%s WHERE id IN %s''']):
389-
build1.local_state = 'done'
390-
build1.flush()
387+
with self.assertQueries(['''UPDATE "runbot_build" SET "global_state"=%s,"local_state"=%s,"write_date"=%s,"write_uid"=%s WHERE id IN %s''']):
388+
build1.local_state = 'done'
389+
build1.flush()
391390

392391
self.assertEqual('waiting', build1.global_state)
393392
self.assertEqual('testing', build1_1.global_state)
394393

395-
# with self.assertQueries([]): # write the same value, no update should be triggered
396-
build1.local_state = 'done'
397-
build1.flush()
394+
with self.assertQueries([]): # write the same value, no update should be triggered
395+
build1.local_state = 'done'
396+
build1.flush()
398397

399398
build1_1.local_state = 'done'
400399

0 commit comments

Comments
 (0)