From 44d650b4c645f8f5e0f56b6c5f3bf9f40819d902 Mon Sep 17 00:00:00 2001 From: Karsten Jeschkies Date: Wed, 17 Jul 2019 21:09:28 +0200 Subject: [PATCH] Exit with 111 if Marathon could not bind to address. (#6962) (#6974) This should ease debugging issues such as COPS-4623. Co-Authored-By: Andreas Neumann --- docs/docs/exit-codes.md | 29 ++++++++++--------- src/main/scala/mesosphere/marathon/Main.scala | 19 ++++++++++-- .../marathon/core/base/CrashStrategy.scala | 1 + 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/docs/docs/exit-codes.md b/docs/docs/exit-codes.md index 6f93dc9aba7..8c36d6488d0 100644 --- a/docs/docs/exit-codes.md +++ b/docs/docs/exit-codes.md @@ -8,20 +8,21 @@ Marathon follows the [Let-it-crash](https://www.reactivedesignpatterns.com/patte of trying to fix an illegal state it will stop itself to be restarted by its supervisor. The following exit codes should help you figure out why the Marathon process stopped. -| Exit Code | Reason | -|-----------|-------------------------------------------------------------------------------| -|100 | `ZookeeperConnectionFailure` - Could not connect to Zookeeper | -|101 | `ZookeeperConnectionLost` - Lost connect to Zookeeper | -|102 | `PluginInitializationFailure` - Could not load plugin | -|103 | `LeadershipLoss` - Lost leadership | -|104 | `LeadershipEndedFaulty` - Leadership ended with an error | -|105 | `LeadershipEndedGracefully` - Leadership ended without an error | -|106 | `MesosSchedulerError` - The Mesos scheduler driver had an error | -|107 | `UncaughtException` - An internal unknown error could not be handled | -|108 | `FrameworkIdMissing` The Framework ID could not be read | -|109 | `IncompatibleLibMesos` Provided LibMesos version is incompatible | -|110 | `FrameworkHasBeenRemoved` Framework has been removed via Mesos teardown call | -|137 | Killed by an external process or uncaught exception | +| Exit Code | Reason | 
+|-----------|------------------------------------------------------------------------------------------------------| +|100 | `ZookeeperConnectionFailure` - Could not connect to Zookeeper | +|101 | `ZookeeperConnectionLost` - Lost connection to Zookeeper | +|102 | `PluginInitializationFailure` - Could not load plugin | +|103 | `LeadershipLoss` - Lost leadership | +|104 | `LeadershipEndedFaulty` - Leadership ended with an error | +|105 | `LeadershipEndedGracefully` - Leadership ended without an error | +|106 | `MesosSchedulerError` - The Mesos scheduler driver had an error | +|107 | `UncaughtException` - An internal unknown error could not be handled | +|108 | `FrameworkIdMissing` - The Framework ID could not be read | +|109 | `IncompatibleLibMesos` - Provided LibMesos version is incompatible | +|110 | `FrameworkHasBeenRemoved` - Framework has been removed via Mesos teardown call | +|111 | `BindingError` - Marathon could not bind to the address provided by `--http_address` and `--http_port` | +|137 | Killed by an external process or uncaught exception | ## Troubleshooting Exit Codes diff --git a/src/main/scala/mesosphere/marathon/Main.scala b/src/main/scala/mesosphere/marathon/Main.scala index 3732d68f4c4..8c920e6c20a 100644 --- a/src/main/scala/mesosphere/marathon/Main.scala +++ b/src/main/scala/mesosphere/marathon/Main.scala @@ -3,7 +3,7 @@ package mesosphere.marathon import java.util.concurrent.TimeUnit import akka.actor.ActorSystem -import com.google.common.util.concurrent.ServiceManager +import com.google.common.util.concurrent.{Service, ServiceManager} import com.google.inject.{Guice, Module} import com.typesafe.scalalogging.StrictLogging import mesosphere.marathon.api.LeaderProxyFilterModule @@ -28,7 +28,10 @@ class MarathonApp(args: Seq[String]) extends AutoCloseable with StrictLogging { SLF4JBridgeHandler.install() Thread.setDefaultUncaughtExceptionHandler((thread: Thread, throwable: Throwable) => { logger.error(s"Terminating due to uncaught exception in thread 
${thread.getName}:${thread.getId}", throwable) - Runtime.getRuntime.asyncExit(CrashStrategy.UncaughtException.code) + throwable match { + case e: java.net.BindException => Runtime.getRuntime.asyncExit(CrashStrategy.BindingError.code) + case other => Runtime.getRuntime.asyncExit(CrashStrategy.UncaughtException.code) + } }) if (LibMesos.isCompatible) { @@ -91,6 +94,18 @@ class MarathonApp(args: Seq[String]) extends AutoCloseable with StrictLogging { try { serviceManager.foreach(_.awaitHealthy()) } catch { + case ie: IllegalStateException => + logger.error(s"Failed to start all services. Services by state: ${serviceManager.map(_.servicesByState()).getOrElse("[]")}", ie) + + // Try to find a failed service, if we have one, rethrow the failure cause to handle it in the + // unchecked exception handler. + services.find(_.state() == Service.State.FAILED).foreach(failedService => { + throw failedService.failureCause() + }) + + // Otherwise just shutdown and rethrow the original exception + shutdownAndWait() + throw ie case e: Exception => logger.error(s"Failed to start all services. Services by state: ${serviceManager.map(_.servicesByState()).getOrElse("[]")}", e) shutdownAndWait() diff --git a/src/main/scala/mesosphere/marathon/core/base/CrashStrategy.scala b/src/main/scala/mesosphere/marathon/core/base/CrashStrategy.scala index 5ade11d7313..efbf1855a6b 100644 --- a/src/main/scala/mesosphere/marathon/core/base/CrashStrategy.scala +++ b/src/main/scala/mesosphere/marathon/core/base/CrashStrategy.scala @@ -24,6 +24,7 @@ object CrashStrategy { case object FrameworkIdMissing extends Reason { override val code: Int = 108 } case object IncompatibleLibMesos extends Reason { override val code: Int = 109 } case object FrameworkHasBeenRemoved extends Reason { override val code: Int = 110 } + case object BindingError extends Reason { override val code: Int = 111 } } case object JvmExitsCrashStrategy extends CrashStrategy {