From 5f5d70b6c4d2fb1a889479569107f1692239e8a7 Mon Sep 17 00:00:00 2001 From: "Ian (Hee) Cha" Date: Tue, 27 Aug 2024 01:31:54 -0700 Subject: [PATCH] =?UTF-8?q?[7.4.0]=20Mark=20remote=20cache=20errors=20as?= =?UTF-8?q?=20retriable=20by=20expanding=20`--incompatible=5Fre=E2=80=A6?= =?UTF-8?q?=20(#23426)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …mote_use_new_exit_code_for_lost_inputs` If `--incompatible_remote_use_new_exit_code_for_lost_inputs` is set, or if `--experimental_remote_cache_eviction_retries` is set, all remote cache errors during cache lookup will be (eligible for) being retried by restarting the build. This does not apply to remote cache errors during remote execution, though. Fixes https://github.com/bazelbuild/bazel/issues/23033. Closes #23079. PiperOrigin-RevId: 667544073 Change-Id: Ia61c6e94b0d64842b930a7fbb62c68d6b86d7ce9 Commit https://github.com/bazelbuild/bazel/commit/9392f8f24eadebcbd5c88035e1eaa5b4429191d7 Co-authored-by: Cornelius Riemenschneider --- .../build/lib/exec/AbstractSpawnStrategy.java | 23 +++++++++++++------ .../build/lib/exec/ExecutionOptions.java | 14 +++++++---- .../build/lib/remote/RemoteSpawnRunner.java | 2 +- .../lib/runtime/BlazeCommandDispatcher.java | 16 ++++++------- src/main/protobuf/failure_details.proto | 5 ++++ .../remote/build_without_the_bytes_test.sh | 2 +- 6 files changed, 40 insertions(+), 22 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/exec/AbstractSpawnStrategy.java b/src/main/java/com/google/devtools/build/lib/exec/AbstractSpawnStrategy.java index 05bf13996af656..10e4258f4b482e 100644 --- a/src/main/java/com/google/devtools/build/lib/exec/AbstractSpawnStrategy.java +++ b/src/main/java/com/google/devtools/build/lib/exec/AbstractSpawnStrategy.java @@ -289,17 +289,26 @@ public ListenableFuture prefetchInputs() Priority.MEDIUM), BulkTransferException.class, (BulkTransferException e) -> { - if (BulkTransferException.allCausedByCacheNotFoundException(e)) { - var code = - (executionOptions.useNewExitCodeForLostInputs - || executionOptions.remoteRetryOnCacheEviction > 0) - ? Code.REMOTE_CACHE_EVICTED - : Code.REMOTE_CACHE_FAILED; + if (executionOptions.useNewExitCodeForLostInputs + || executionOptions.remoteRetryOnTransientCacheError > 0) { + var message = + BulkTransferException.allCausedByCacheNotFoundException(e) + ? "Failed to fetch blobs because they do not exist remotely." + : "Failed to fetch blobs because of a remote cache error."; + throw new EnvironmentalExecException( + e, + FailureDetail.newBuilder() + .setMessage(message) + .setSpawn( + FailureDetails.Spawn.newBuilder().setCode(Code.REMOTE_CACHE_EVICTED)) + .build()); + } else if (BulkTransferException.allCausedByCacheNotFoundException(e)) { throw new EnvironmentalExecException( e, FailureDetail.newBuilder() .setMessage("Failed to fetch blobs because they do not exist remotely.") - .setSpawn(FailureDetails.Spawn.newBuilder().setCode(code)) + .setSpawn( + FailureDetails.Spawn.newBuilder().setCode(Code.REMOTE_CACHE_FAILED)) .build()); } else { throw e; diff --git a/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java b/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java index 6210747c99286a..bb4fdc52c06ffd 100644 --- a/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java +++ b/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java @@ -511,24 +511,28 @@ public boolean usingLocalTestJobs() { effectTags = {OptionEffectTag.UNKNOWN}, metadataTags = {OptionMetadataTag.INCOMPATIBLE_CHANGE}, help = - "If set to true, Bazel will use new exit code 39 instead of 34 if remote cache evicts" - + " blobs during the build.") + "If set to true, Bazel will use new exit code 39 instead of 34 if remote cache" + + "errors, including cache evictions, cause the build to fail.") public boolean useNewExitCodeForLostInputs; @Option( + // TODO: when this flag is moved to non-experimental, rename it to a more general name + // to reflect the new logic - it's not only about cache evictions. name = "experimental_remote_cache_eviction_retries", defaultValue = "0", documentationCategory = OptionDocumentationCategory.REMOTE, effectTags = {OptionEffectTag.EXECUTION}, help = - "The maximum number of attempts to retry if the build encountered remote cache eviction" - + " error. A non-zero value will implicitly set" + "The maximum number of attempts to retry if the build encountered a transient remote" + + " cache error that would otherwise fail the build. Applies for example when" + + " artifacts are evicted from the remote cache, or in certain cache failure" + + " conditions. A non-zero value will implicitly set" + " --incompatible_remote_use_new_exit_code_for_lost_inputs to true. A new invocation" + " id will be generated for each attempt. If you generate invocation id and provide" + " it to Bazel with --invocation_id, you should not use this flag. Instead, set flag" + " --incompatible_remote_use_new_exit_code_for_lost_inputs and check for the exit" + " code 39.") - public int remoteRetryOnCacheEviction; + public int remoteRetryOnTransientCacheError; /** An enum for specifying different formats of test output. */ public enum TestOutputFormat { diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java index 934cdff7382f15..0d7d6c7110e8d3 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java @@ -616,7 +616,7 @@ private SpawnResult handleError( } else if (remoteCacheFailed) { status = Status.REMOTE_CACHE_FAILED; if (executionOptions.useNewExitCodeForLostInputs - || executionOptions.remoteRetryOnCacheEviction > 0) { + || executionOptions.remoteRetryOnTransientCacheError > 0) { detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_EVICTED; } else { detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_FAILED; diff --git a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java index b20a1bb41bba6c..509e1524393000 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java @@ -254,7 +254,7 @@ public BlazeCommandResult exec( attemptedCommandIds, lastResult); break; - } catch (RemoteCacheEvictedException e) { + } catch (RemoteCacheTransientErrorException e) { attemptedCommandIds.add(e.getCommandId()); lastResult = e.getResult(); } @@ -308,7 +308,7 @@ private BlazeCommandResult execExclusively( List commandExtensions, Set attemptedCommandIds, @Nullable BlazeCommandResult lastResult) - throws RemoteCacheEvictedException { + throws RemoteCacheTransientErrorException { // Record the start time for the profiler. Do not put anything before this! long execStartTimeNanos = runtime.getClock().nanoTime(); @@ -344,7 +344,7 @@ private BlazeCommandResult execExclusively( env.getCommandId())); return Preconditions.checkNotNull(lastResult); } else { - outErr.printErrLn("Found remote cache eviction error, retrying the build..."); + outErr.printErrLn("Found transient remote cache error, retrying the build..."); } } @@ -689,13 +689,13 @@ private BlazeCommandResult execExclusively( if (newResult.getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) { var executionOptions = Preconditions.checkNotNull(options.getOptions(ExecutionOptions.class)); - if (attemptedCommandIds.size() < executionOptions.remoteRetryOnCacheEviction) { - throw new RemoteCacheEvictedException(env.getCommandId(), newResult); + if (attemptedCommandIds.size() < executionOptions.remoteRetryOnTransientCacheError) { + throw new RemoteCacheTransientErrorException(env.getCommandId(), newResult); } } return newResult; - } catch (RemoteCacheEvictedException e) { + } catch (RemoteCacheTransientErrorException e) { throw e; } catch (Throwable e) { logger.atSevere().withCause(e).log("Shutting down due to exception"); @@ -730,11 +730,11 @@ private BlazeCommandResult execExclusively( } } - private static class RemoteCacheEvictedException extends IOException { + private static class RemoteCacheTransientErrorException extends IOException { private final UUID commandId; private final BlazeCommandResult result; - private RemoteCacheEvictedException(UUID commandId, BlazeCommandResult result) { + private RemoteCacheTransientErrorException(UUID commandId, BlazeCommandResult result) { this.commandId = commandId; this.result = result; } diff --git a/src/main/protobuf/failure_details.proto b/src/main/protobuf/failure_details.proto index 1e85251ba12a3d..3886f627521f90 100644 --- a/src/main/protobuf/failure_details.proto +++ b/src/main/protobuf/failure_details.proto @@ -226,6 +226,11 @@ message Spawn { // refactored to prohibit undetailed failures UNSPECIFIED_EXECUTION_FAILURE = 12 [(metadata) = { exit_code: 1 }]; FORBIDDEN_INPUT = 13 [(metadata) = { exit_code: 1 }]; + // This also includes other remote cache errors, not just evictions, + // if --incompatible_remote_use_new_exit_code_for_lost_inputs is set. + // TODO: Rename it to a more general name when + // --experimental_remote_cache_eviction_retries is moved to + // non-experimental. REMOTE_CACHE_EVICTED = 14 [(metadata) = { exit_code: 39 }]; SPAWN_LOG_IO_EXCEPTION = 15 [(metadata) = { exit_code: 36 }]; } diff --git a/src/test/shell/bazel/remote/build_without_the_bytes_test.sh b/src/test/shell/bazel/remote/build_without_the_bytes_test.sh index 45dd0007c60151..d800d7a50c6b85 100755 --- a/src/test/shell/bazel/remote/build_without_the_bytes_test.sh +++ b/src/test/shell/bazel/remote/build_without_the_bytes_test.sh @@ -2022,7 +2022,7 @@ EOF //a:bar >& $TEST_log || fail "Failed to build" expect_log 'Failed to fetch blobs because they do not exist remotely.' - expect_log "Found remote cache eviction error, retrying the build..." + expect_log "Found transient remote cache error, retrying the build..." local invocation_ids=$(grep "Invocation ID:" $TEST_log) local first_id=$(echo "$invocation_ids" | head -n 1)