Skip to content

Commit

Permalink
[7.4.0] Mark remote cache errors as retriable by expanding `--incompa…
Browse files Browse the repository at this point in the history
…tible_re… (bazelbuild#23426)

…mote_use_new_exit_code_for_lost_inputs`

If `--incompatible_remote_use_new_exit_code_for_lost_inputs` is set, or
if `--experimental_remote_cache_eviction_retries` is set, all remote
cache errors during cache lookup will be (eligible for) being retried by
restarting the build. This does not apply to remote cache errors during
remote execution, though.

Fixes bazelbuild#23033.

Closes bazelbuild#23079.

PiperOrigin-RevId: 667544073
Change-Id: Ia61c6e94b0d64842b930a7fbb62c68d6b86d7ce9

Commit
bazelbuild@9392f8f

Co-authored-by: Cornelius Riemenschneider <[email protected]>
  • Loading branch information
iancha1992 and criemen authored Aug 27, 2024
1 parent d131e38 commit 5f5d70b
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -289,17 +289,26 @@ public ListenableFuture<Void> prefetchInputs()
Priority.MEDIUM),
BulkTransferException.class,
(BulkTransferException e) -> {
if (BulkTransferException.allCausedByCacheNotFoundException(e)) {
var code =
(executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnCacheEviction > 0)
? Code.REMOTE_CACHE_EVICTED
: Code.REMOTE_CACHE_FAILED;
if (executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnTransientCacheError > 0) {
var message =
BulkTransferException.allCausedByCacheNotFoundException(e)
? "Failed to fetch blobs because they do not exist remotely."
: "Failed to fetch blobs because of a remote cache error.";
throw new EnvironmentalExecException(
e,
FailureDetail.newBuilder()
.setMessage(message)
.setSpawn(
FailureDetails.Spawn.newBuilder().setCode(Code.REMOTE_CACHE_EVICTED))
.build());
} else if (BulkTransferException.allCausedByCacheNotFoundException(e)) {
throw new EnvironmentalExecException(
e,
FailureDetail.newBuilder()
.setMessage("Failed to fetch blobs because they do not exist remotely.")
.setSpawn(FailureDetails.Spawn.newBuilder().setCode(code))
.setSpawn(
FailureDetails.Spawn.newBuilder().setCode(Code.REMOTE_CACHE_FAILED))
.build());
} else {
throw e;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -511,24 +511,28 @@ public boolean usingLocalTestJobs() {
effectTags = {OptionEffectTag.UNKNOWN},
metadataTags = {OptionMetadataTag.INCOMPATIBLE_CHANGE},
help =
"If set to true, Bazel will use new exit code 39 instead of 34 if remote cache evicts"
+ " blobs during the build.")
"If set to true, Bazel will use new exit code 39 instead of 34 if remote cache"
+ "errors, including cache evictions, cause the build to fail.")
public boolean useNewExitCodeForLostInputs;

@Option(
// TODO: when this flag is moved to non-experimental, rename it to a more general name
// to reflect the new logic - it's not only about cache evictions.
name = "experimental_remote_cache_eviction_retries",
defaultValue = "0",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.EXECUTION},
help =
"The maximum number of attempts to retry if the build encountered remote cache eviction"
+ " error. A non-zero value will implicitly set"
"The maximum number of attempts to retry if the build encountered a transient remote"
+ " cache error that would otherwise fail the build. Applies for example when"
+ " artifacts are evicted from the remote cache, or in certain cache failure"
+ " conditions. A non-zero value will implicitly set"
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs to true. A new invocation"
+ " id will be generated for each attempt. If you generate invocation id and provide"
+ " it to Bazel with --invocation_id, you should not use this flag. Instead, set flag"
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs and check for the exit"
+ " code 39.")
public int remoteRetryOnCacheEviction;
public int remoteRetryOnTransientCacheError;

/** An enum for specifying different formats of test output. */
public enum TestOutputFormat {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,7 @@ private SpawnResult handleError(
} else if (remoteCacheFailed) {
status = Status.REMOTE_CACHE_FAILED;
if (executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnCacheEviction > 0) {
|| executionOptions.remoteRetryOnTransientCacheError > 0) {
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_EVICTED;
} else {
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_FAILED;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ public BlazeCommandResult exec(
attemptedCommandIds,
lastResult);
break;
} catch (RemoteCacheEvictedException e) {
} catch (RemoteCacheTransientErrorException e) {
attemptedCommandIds.add(e.getCommandId());
lastResult = e.getResult();
}
Expand Down Expand Up @@ -308,7 +308,7 @@ private BlazeCommandResult execExclusively(
List<Any> commandExtensions,
Set<UUID> attemptedCommandIds,
@Nullable BlazeCommandResult lastResult)
throws RemoteCacheEvictedException {
throws RemoteCacheTransientErrorException {
// Record the start time for the profiler. Do not put anything before this!
long execStartTimeNanos = runtime.getClock().nanoTime();

Expand Down Expand Up @@ -344,7 +344,7 @@ private BlazeCommandResult execExclusively(
env.getCommandId()));
return Preconditions.checkNotNull(lastResult);
} else {
outErr.printErrLn("Found remote cache eviction error, retrying the build...");
outErr.printErrLn("Found transient remote cache error, retrying the build...");
}
}

Expand Down Expand Up @@ -689,13 +689,13 @@ private BlazeCommandResult execExclusively(
if (newResult.getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) {
var executionOptions =
Preconditions.checkNotNull(options.getOptions(ExecutionOptions.class));
if (attemptedCommandIds.size() < executionOptions.remoteRetryOnCacheEviction) {
throw new RemoteCacheEvictedException(env.getCommandId(), newResult);
if (attemptedCommandIds.size() < executionOptions.remoteRetryOnTransientCacheError) {
throw new RemoteCacheTransientErrorException(env.getCommandId(), newResult);
}
}

return newResult;
} catch (RemoteCacheEvictedException e) {
} catch (RemoteCacheTransientErrorException e) {
throw e;
} catch (Throwable e) {
logger.atSevere().withCause(e).log("Shutting down due to exception");
Expand Down Expand Up @@ -730,11 +730,11 @@ private BlazeCommandResult execExclusively(
}
}

private static class RemoteCacheEvictedException extends IOException {
private static class RemoteCacheTransientErrorException extends IOException {
private final UUID commandId;
private final BlazeCommandResult result;

private RemoteCacheEvictedException(UUID commandId, BlazeCommandResult result) {
private RemoteCacheTransientErrorException(UUID commandId, BlazeCommandResult result) {
this.commandId = commandId;
this.result = result;
}
Expand Down
5 changes: 5 additions & 0 deletions src/main/protobuf/failure_details.proto
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,11 @@ message Spawn {
// refactored to prohibit undetailed failures
UNSPECIFIED_EXECUTION_FAILURE = 12 [(metadata) = { exit_code: 1 }];
FORBIDDEN_INPUT = 13 [(metadata) = { exit_code: 1 }];
// This also includes other remote cache errors, not just evictions,
// if --incompatible_remote_use_new_exit_code_for_lost_inputs is set.
// TODO: Rename it to a more general name when
// --experimental_remote_cache_eviction_retries is moved to
// non-experimental.
REMOTE_CACHE_EVICTED = 14 [(metadata) = { exit_code: 39 }];
SPAWN_LOG_IO_EXCEPTION = 15 [(metadata) = { exit_code: 36 }];
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2022,7 +2022,7 @@ EOF
//a:bar >& $TEST_log || fail "Failed to build"

expect_log 'Failed to fetch blobs because they do not exist remotely.'
expect_log "Found remote cache eviction error, retrying the build..."
expect_log "Found transient remote cache error, retrying the build..."

local invocation_ids=$(grep "Invocation ID:" $TEST_log)
local first_id=$(echo "$invocation_ids" | head -n 1)
Expand Down

0 comments on commit 5f5d70b

Please sign in to comment.