From 308b6dd8bc23ea3dcc3607fe12e27d923fe48517 Mon Sep 17 00:00:00 2001 From: Googler Date: Fri, 9 Aug 2024 10:38:05 -0700 Subject: [PATCH] Make it possible to get a disk/remote cache hit for a test following a failure of the same test. A source edit might have occurred in the interim, putting the test back into a passing state that has been previously stored in the cache. The current logic disables disk/remote caching when rerunning a test following a failure, which causes spurious reexecutions (see issue #11057). Although this code has been around for a very long time, it seems that it was originally added to ensure that a flaky test failure doesn't become sticky. However, for this purpose it suffices to bust the local cache, as a disk/remote cache should never store failures (if it did, this would not be a complete fix, and we'd have much bigger problems elsewhere). Note that Bazel itself never uploads local failures to a disk/remote cache, but can't control whether a remote executor does. The fix is to split the `TestRunnerAction.canBeCached` logic into two versions: one that takes into account a previous test result (for the local cache check) and another that doesn't (to determine whether we should attempt to hit the disk/remote cache). Fixes #11057. PiperOrigin-RevId: 661319835 Change-Id: I9248cbfa31dd135b2bda971e48bb17d7f828889c (cherry picked from commit e9709b7d4b42aa3315d02bd658013508b2cb66f3) --- .../lib/analysis/test/TestRunnerAction.java | 131 +++++++++--------- .../lib/exec/StandaloneTestStrategy.java | 5 +- src/test/shell/bazel/disk_cache_test.sh | 38 ++++- 3 files changed, 105 insertions(+), 69 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/analysis/test/TestRunnerAction.java b/src/main/java/com/google/devtools/build/lib/analysis/test/TestRunnerAction.java index c431b15ed63d24..52c4ab456d1afe 100644 --- a/src/main/java/com/google/devtools/build/lib/analysis/test/TestRunnerAction.java +++ b/src/main/java/com/google/devtools/build/lib/analysis/test/TestRunnerAction.java @@ -501,11 +501,46 @@ protected void computeKey( fp.addStringMap(getExecutionInfo()); } + /** + * Returns whether the test should be executed unconditionally based on the test configuration, + * the test properties, and the previous test result when known. + */ @Override public boolean executeUnconditionally() { // Note: isVolatile must return true if executeUnconditionally can ever return true // for this instance. - return computeExecuteUnconditionallyFromTestStatus(); + return executeUnconditionally( + testConfiguration.cacheTestResults(), + this::maybeReadCacheStatus, + testProperties.isExternal(), + executionSettings.getTotalRuns()); + } + + @VisibleForTesting + static boolean executeUnconditionally( + TriState cacheTestResults, + Supplier> prevStatus, // lazy to avoid I/O if possible + boolean isExternal, + int runsPerTest) { + if (!shouldCacheResult(cacheTestResults, isExternal, runsPerTest)) { + return true; + } + Optional status = prevStatus.get(); + if (status.isEmpty()) { + // Execute unconditionally if a previous test result is not available. + return true; + } + if (!status.get().getCachable()) { + // Execute unconditionally if the previous test result was marked non-cacheable. + // It seems that this can only happen with --experimental_cancel_concurrent_tests. + return true; + } + if (cacheTestResults == TriState.AUTO && !status.get().getTestPassed()) { + // Execute unconditionally if the previous test result was a failure, as otherwise we can + // get stuck forever in the event of a flaky failure. + return true; + } + return false; } @Override @@ -553,72 +588,41 @@ TestResultData readCacheStatus() throws IOException { } } - private boolean computeExecuteUnconditionallyFromTestStatus() { - CacheableTest cacheStatus = - canBeCached( - testConfiguration.cacheTestResults(), - this::maybeReadCacheStatus, - testProperties.isExternal(), - executionSettings.getTotalRuns()); - switch (cacheStatus) { - case NO_STATUS_ON_DISK: - // execute unconditionally if no status available on disk - case NO: - return true; - case YES: - return false; - } - throw new IllegalStateException("Unreachable. Bad cache status: " + cacheStatus); + /** + * Returns whether a cached result should be accepted from a disk/remote cache, depending on the + * test configuration and test properties. + * + *

This should *not* be used to determine whether to accept a cached result from the action + * cache. Call {@link #executeUnconditionally} instead. + * + *

Unlike {@link #executeUnconditionally}, this decision does not depend on the previous test + * result, as otherwise we wouldn't attempt to hit the disk/remote cache when the test has changed + * from a failing to a passing state since the last execution without causing the action to be + * reanalyzed (for example, by editing a source file into a passing state that has been previously + * seen). + * + *

We're not concerned about a flaky failure becoming sticky in the disk/remote cache, because + * it's impossible to solve this problem generally. In any case, this can only occur with a remote + * execution implementation that caches failures, as we never upload them to a disk/remote cache + * ourselves. + */ + public boolean shouldCacheResult() { + return shouldCacheResult( + testConfiguration.cacheTestResults(), + testProperties.isExternal(), + executionSettings.getTotalRuns()); } @VisibleForTesting - static CacheableTest canBeCached( - TriState cacheTestResults, - Supplier> - prevStatus, // Lazy evaluation to avoid a disk read if possible. - boolean isExternal, - int runsPerTest) { + static boolean shouldCacheResult( + TriState cacheTestResults, boolean isExternal, int runsPerTest) { if (isExternal || cacheTestResults == TriState.NO) { - return CacheableTest.NO; + return false; } if (cacheTestResults == TriState.AUTO && runsPerTest > 1) { - return CacheableTest.NO; - } - Optional status = prevStatus.get(); - // unable to read status from disk - if (status.isEmpty()) { - return CacheableTest.NO_STATUS_ON_DISK; - } - if (!status.get().getCachable()) { - return CacheableTest.NO; - } - if (cacheTestResults == TriState.AUTO && !status.get().getTestPassed()) { - return CacheableTest.NO; - } - return CacheableTest.YES; - } - - /** - * Returns whether caching has been deemed safe by looking at the previous test run (for local - * caching). If the previous run is not present or cached status was retrieved unsuccessfully, - * return "true" here, as remote execution caching should be safe. - */ - public boolean shouldCacheResult() { - CacheableTest cacheStatus = - canBeCached( - testConfiguration.cacheTestResults(), - this::maybeReadCacheStatus, - testProperties.isExternal(), - executionSettings.getTotalRuns()); - switch (cacheStatus) { - // optimistically cache results if status unavailable - case YES: - case NO_STATUS_ON_DISK: - return true; - case NO: - return false; + return false; } - throw new IllegalStateException("Unreachable. Bad cache status: " + cacheStatus); + return true; } @Override @@ -1298,11 +1302,4 @@ public boolean isEmpty() { return false; } } - - @VisibleForTesting - enum CacheableTest { - YES, - NO, - NO_STATUS_ON_DISK - } } diff --git a/src/main/java/com/google/devtools/build/lib/exec/StandaloneTestStrategy.java b/src/main/java/com/google/devtools/build/lib/exec/StandaloneTestStrategy.java index b30534f161bed1..6e1a9dc2beadd7 100644 --- a/src/main/java/com/google/devtools/build/lib/exec/StandaloneTestStrategy.java +++ b/src/main/java/com/google/devtools/build/lib/exec/StandaloneTestStrategy.java @@ -109,7 +109,10 @@ public TestRunnerSpawn createTestRunnerSpawn( Map executionInfo = new TreeMap<>(action.getTestProperties().getExecutionInfo()); - if (!action.shouldCacheResult()) { + if (!action.shouldAcceptCachedResult()) { + // TODO(tjgq): We want to reject a previously cached result, but not prevent the result of the + // current execution from being uploaded. We should introduce a separate execution requirement + // for this. executionInfo.put(ExecutionRequirements.NO_CACHE, ""); } executionInfo.put(ExecutionRequirements.TIMEOUT, "" + getTimeout(action).getSeconds()); diff --git a/src/test/shell/bazel/disk_cache_test.sh b/src/test/shell/bazel/disk_cache_test.sh index 05c6546ebe5f73..f4f3c3012e9034 100755 --- a/src/test/shell/bazel/disk_cache_test.sh +++ b/src/test/shell/bazel/disk_cache_test.sh @@ -91,4 +91,40 @@ EOF } -run_suite "local action cache test" +function test_cache_hit_on_source_edit_after_test_failure() { + # Regression test for https://github.com/bazelbuild/bazel/issues/11057. + + local -r CACHE_DIR="${TEST_TMPDIR}/cache" + rm -rf "$CACHE_DIR" + + mkdir -p a + cat > a/BUILD <<'EOF' +sh_test( + name = "test", + srcs = ["test.sh"], +) +EOF + echo "exit 0" > a/test.sh + chmod +x a/test.sh + + # Populate the cache with a passing test. + bazel test --disk_cache="$CACHE_DIR" //a:test >& $TEST_log \ + || fail "Expected test to pass" + + # Turn the test into a failing one. + echo "exit 1" > a/test.sh + + # Check that the test fails. + bazel test --disk_cache="$CACHE_DIR" //a:test >& $TEST_log \ + && fail "Expected test to fail" + + # Turn the test into a passing one again. + echo "exit 0" > a/test.sh + + # Check that we hit the previously populated cache. + bazel test --disk_cache="$CACHE_DIR" //a:test >& $TEST_log \ + || fail "Expected test to pass" + expect_log "(cached) PASSED" +} + +run_suite "disk cache test"