Skip to content
This repository has been archived by the owner on Oct 23, 2024. It is now read-only.

Commit

Permalink
Exit with 111 if Marathon could not bind to address. (#6962) (#6974)
Browse files Browse the repository at this point in the history
This should ease debugging issues such as COPS-4623.

Co-Authored-By: Andreas Neumann <[email protected]>
  • Loading branch information
2 people authored and Tim Harper committed Jul 17, 2019
1 parent e2f2f2d commit 44d650b
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 16 deletions.
29 changes: 15 additions & 14 deletions docs/docs/exit-codes.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,21 @@ Marathon follows the [Let-it-crash](https://www.reactivedesignpatterns.com/patte
of trying to fix an illegal state it will stop itself to be restarted by its supervisor. The following exit codes should
help you figure out why the Marathon process stopped.

| Exit Code | Reason |
|-----------|-------------------------------------------------------------------------------|
|100 | `ZookeeperConnectionFailure` - Could not connect to Zookeeper |
|101 | `ZookeeperConnectionLost` - Lost connect to Zookeeper |
|102 | `PluginInitializationFailure` - Could not load plugin |
|103 | `LeadershipLoss` - Lost leadership |
|104 | `LeadershipEndedFaulty` - Leadership ended with an error |
|105 | `LeadershipEndedGracefully` - Leadership ended without an error |
|106 | `MesosSchedulerError` - The Mesos scheduler driver had an error |
|107 | `UncaughtException` - An internal unknown error could not be handled |
|108 | `FrameworkIdMissing` The Framework ID could not be read |
|109 | `IncompatibleLibMesos` Provided LibMesos version is incompatible |
|110 | `FrameworkHasBeenRemoved` Framework has been removed via Mesos teardown call |
|137 | Killed by an external process or uncaught exception |
| Exit Code | Reason |
|-----------|------------------------------------------------------------------------------------------------------|
|100 | `ZookeeperConnectionFailure` - Could not connect to Zookeeper |
|101 | `ZookeeperConnectionLost` - Lost connection to Zookeeper |
|102 | `PluginInitializationFailure` - Could not load plugin |
|103 | `LeadershipLoss` - Lost leadership |
|104 | `LeadershipEndedFaulty` - Leadership ended with an error |
|105 | `LeadershipEndedGracefully` - Leadership ended without an error |
|106 | `MesosSchedulerError` - The Mesos scheduler driver had an error |
|107 | `UncaughtException` - An internal unknown error could not be handled |
|108 | `FrameworkIdMissing` - The Framework ID could not be read |
|109 | `IncompatibleLibMesos` - Provided LibMesos version is incompatible |
|110 | `FrameworkHasBeenRemoved` - Framework has been removed via Mesos teardown call |
|111 | `BindingError` - Marathon could not bind to the address provided by `--http_address` and `--http_port` |
|137 | Killed by an external process or uncaught exception |

## Troubleshooting Exit Codes

Expand Down
19 changes: 17 additions & 2 deletions src/main/scala/mesosphere/marathon/Main.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package mesosphere.marathon
import java.util.concurrent.TimeUnit

import akka.actor.ActorSystem
import com.google.common.util.concurrent.ServiceManager
import com.google.common.util.concurrent.{Service, ServiceManager}
import com.google.inject.{Guice, Module}
import com.typesafe.scalalogging.StrictLogging
import mesosphere.marathon.api.LeaderProxyFilterModule
Expand All @@ -28,7 +28,10 @@ class MarathonApp(args: Seq[String]) extends AutoCloseable with StrictLogging {
SLF4JBridgeHandler.install()
Thread.setDefaultUncaughtExceptionHandler((thread: Thread, throwable: Throwable) => {
logger.error(s"Terminating due to uncaught exception in thread ${thread.getName}:${thread.getId}", throwable)
Runtime.getRuntime.asyncExit(CrashStrategy.UncaughtException.code)
throwable match {
case e: java.net.BindException => Runtime.getRuntime.asyncExit(CrashStrategy.BindingError.code)
case other => Runtime.getRuntime.asyncExit(CrashStrategy.UncaughtException.code)
}
})

if (LibMesos.isCompatible) {
Expand Down Expand Up @@ -91,6 +94,18 @@ class MarathonApp(args: Seq[String]) extends AutoCloseable with StrictLogging {
try {
serviceManager.foreach(_.awaitHealthy())
} catch {
case ie: IllegalStateException =>
logger.error(s"Failed to start all services. Services by state: ${serviceManager.map(_.servicesByState()).getOrElse("[]")}", ie)

// Try to find a failed service. If there is one, rethrow its failure cause so that it is handled by the
// default uncaught exception handler.
services.find(_.state() == Service.State.FAILED).foreach(failedService => {
throw failedService.failureCause()
})

// Otherwise just shut down and rethrow the original exception
shutdownAndWait()
throw ie
case e: Exception =>
logger.error(s"Failed to start all services. Services by state: ${serviceManager.map(_.servicesByState()).getOrElse("[]")}", e)
shutdownAndWait()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ object CrashStrategy {
case object FrameworkIdMissing extends Reason { override val code: Int = 108 }
case object IncompatibleLibMesos extends Reason { override val code: Int = 109 }
case object FrameworkHasBeenRemoved extends Reason { override val code: Int = 110 }
case object BindingError extends Reason { override val code: Int = 111 }
}

case object JvmExitsCrashStrategy extends CrashStrategy {
Expand Down

0 comments on commit 44d650b

Please sign in to comment.