Skip to content

Commit

Permalink
Add option to return SUCCEED when training is completed with some fai…
Browse files Browse the repository at this point in the history
…led job tasks
  • Loading branch information
Yuechen Chen committed Jan 17, 2020
1 parent 9dcae46 commit be28773
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ public static String getContainerDockerKey() {
// Job types for which a task failure short-circuits (immediately finishes) the training
public static final String STOP_ON_FAILURE_JOBTYPES = TONY_APPLICATION_PREFIX + "stop.on.failure.jobtypes";

// Whether the application's final status is set to FAILED when training completes with one or
// more failed job tasks. Enabled by default.
public static final String FAILED_ON_COMPLETED_WITH_FAILURE_ENABLED = TONY_APPLICATION_PREFIX
+ "failed_on_complete_with_failure.enabled";
public static final boolean DEFAULT_FAILED_ON_COMPLETED_WITH_FAILURE_ENABLED = true;

// Training chief configuration
public static final String CHIEF_PREFIX = TONY_PREFIX + "chief.";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import com.google.common.base.Preconditions;
import com.linkedin.tony.Constants;
import com.linkedin.tony.TonyConfigurationKeys;
import com.linkedin.tony.rpc.TaskInfo;
import com.linkedin.tony.rpc.impl.TaskStatus;
import com.linkedin.tony.util.Utils;
Expand Down Expand Up @@ -262,7 +263,12 @@ public void onTaskCompleted(String jobName, String jobIndex, int exitCode) {
if (isChief(jobName, jobIndex) || shouldStopOnFailure(jobName)) {
trainingFinished = true;
}
setFinalStatus(FinalApplicationStatus.FAILED, "Exit status: " + exitCode);
boolean failedOnCompletedWithFailure = tonyConf.getBoolean(
TonyConfigurationKeys.FAILED_ON_COMPLETED_WITH_FAILURE_ENABLED,
TonyConfigurationKeys.DEFAULT_FAILED_ON_COMPLETED_WITH_FAILURE_ENABLED);
if (failedOnCompletedWithFailure) {
setFinalStatus(FinalApplicationStatus.FAILED, "Exit status: " + exitCode);
}
}
}

Expand Down Expand Up @@ -305,7 +311,10 @@ public void updateSessionStatus() {
}
}

if (failureCount > 0) {
boolean failedOnCompletedWithFailure = tonyConf.getBoolean(
TonyConfigurationKeys.FAILED_ON_COMPLETED_WITH_FAILURE_ENABLED,
TonyConfigurationKeys.DEFAULT_FAILED_ON_COMPLETED_WITH_FAILURE_ENABLED);
if (failedOnCompletedWithFailure && failureCount > 0) {
setFinalStatus(FinalApplicationStatus.FAILED,
"At least one job task exited with non-zero status, failedCnt=" + failureCount);
} else {
Expand Down
7 changes: 7 additions & 0 deletions tony-core/src/main/resources/tony-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,13 @@
<value>ps</value>
</property>

<!-- App returned status configurations -->
<property>
<description>Whether to return FAILED when training is completed with some failed job tasks</description>
<name>tony.application.failed_on_complete_with_failure.enabled</name>
<value>true</value>
</property>

<!-- Docker configurations -->
<property>
<description>Whether we use docker container to launch the tasks</description>
Expand Down

0 comments on commit be28773

Please sign in to comment.