From 8a036d63c7f3f0f6ec483fe96cf3487484429cf4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Lang=C3=A9?=
Date: Wed, 28 Oct 2020 09:40:23 +0100
Subject: [PATCH] Properly release lock on all scale checks

Scale checks run regularly to validate that the number of running
instances matches the number of instances expected by the runSpec.
There are two cases:

* Instances are missing; in that case Marathon starts new instances.
* There are too many instances; in that case Marathon kills the overdue
  instances.

In the second case, the lock is supposed to be released only once the
overdue instances are dead. We encountered issues where the lock was
never released because the future returned by
KillStreamWatcher.watchForKilledTasks() never completed. This was caused
by a typo in the code that made it wait for ALL instances to die instead
of just the overdue subset.
---
 src/main/scala/mesosphere/marathon/MarathonSchedulerActor.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/scala/mesosphere/marathon/MarathonSchedulerActor.scala b/src/main/scala/mesosphere/marathon/MarathonSchedulerActor.scala
index 970e497e354..a1552f52af0 100644
--- a/src/main/scala/mesosphere/marathon/MarathonSchedulerActor.scala
+++ b/src/main/scala/mesosphere/marathon/MarathonSchedulerActor.scala
@@ -452,7 +452,7 @@ class SchedulerActions(
     if (instancesToDecommission.nonEmpty) {
       logger.info(s"Adjusting goals for instances ${instancesToDecommission.map(_.instanceId)} (${GoalChangeReason.OverCapacity})")
 
-      val instancesAreTerminal = KillStreamWatcher.watchForKilledTasks(instanceTracker.instanceUpdates, instances).runWith(Sink.ignore)
+      val instancesAreTerminal = KillStreamWatcher.watchForKilledTasks(instanceTracker.instanceUpdates, instancesToDecommission).runWith(Sink.ignore)
 
       // Race condition with line 421. The instances we loaded might not exist anymore, e.g. the agent
       // might have been removed and the instance expunged.
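
The sketch below is a minimal, standalone Scala illustration of the behaviour the
commit message describes, not Marathon's actual API: the object and helper names are
hypothetical, and the real KillStreamWatcher consumes a stream of instance updates
(instanceTracker.instanceUpdates) rather than a plain sequence. It shows why waiting
on the full set of running instances can never complete while healthy instances keep
running, whereas waiting only on the decommissioned subset completes as soon as the
overdue instance is reported terminal.

object KillWatchSketch {
  /** The watcher "completes" only once every watched id has been reported terminal. */
  def allWatchedTerminal(terminalUpdates: Seq[String], watched: Set[String]): Boolean =
    watched.subsetOf(terminalUpdates.toSet)

  def main(args: Array[String]): Unit = {
    val running = Set("app.instance-1", "app.instance-2", "app.instance-3")
    val instancesToDecommission = Set("app.instance-3") // the overdue subset
    val terminalUpdates = Seq("app.instance-3")         // only the overdue instance dies

    // Buggy variant: watching ALL running instances never completes, because the
    // healthy instances keep running, so the lock would never be released.
    println(allWatchedTerminal(terminalUpdates, running))                 // false

    // Fixed variant: watching only the decommissioned subset completes once the
    // overdue instance is terminal, so the lock can be released.
    println(allWatchedTerminal(terminalUpdates, instancesToDecommission)) // true
  }
}

This mirrors the one-line change in the hunk above: the watcher is handed
instancesToDecommission instead of the full instances collection.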