From 308b6dd8bc23ea3dcc3607fe12e27d923fe48517 Mon Sep 17 00:00:00 2001
From: Googler <tjgq@google.com>
Date: Fri, 9 Aug 2024 10:38:05 -0700
Subject: [PATCH] Make it possible to get a disk/remote cache hit for a test
 following a failure of the same test.

A source edit might have occurred in the interim, putting the test back into a passing state that has been previously stored in the cache. The current logic disables disk/remote caching when rerunning a test following a failure, which causes spurious reexecutions (see issue #11057).

Although this code has been around for a very long time, it seems that it was originally added to ensure that a flaky test failure doesn't become sticky. However, for this purpose it suffices to bust the local cache, as a disk/remote cache should never store failures (if it did, this would not be a complete fix, and we'd have much bigger problems elsewhere). Note that Bazel itself never uploads local failures to a disk/remote cache, but can't control whether a remote executor does.

The fix is to split the `TestRunnerAction.canBeCached` logic into two versions: one that takes into account a previous test result (for the local cache check) and another that doesn't (to determine whether we should attempt to hit the disk/remote cache).

Fixes #11057.

PiperOrigin-RevId: 661319835
Change-Id: I9248cbfa31dd135b2bda971e48bb17d7f828889c
(cherry picked from commit e9709b7d4b42aa3315d02bd658013508b2cb66f3)
---
 .../lib/analysis/test/TestRunnerAction.java   | 131 +++++++++---------
 .../lib/exec/StandaloneTestStrategy.java      |   5 +-
 src/test/shell/bazel/disk_cache_test.sh       |  38 ++++-
 3 files changed, 105 insertions(+), 69 deletions(-)
diff --git a/src/main/java/com/google/devtools/build/lib/analysis/test/TestRunnerAction.java b/src/main/java/com/google/devtools/build/lib/analysis/test/TestRunnerAction.java
index c431b15ed63d24..52c4ab456d1afe 100644
--- a/src/main/java/com/google/devtools/build/lib/analysis/test/TestRunnerAction.java
+++ b/src/main/java/com/google/devtools/build/lib/analysis/test/TestRunnerAction.java
@@ -501,11 +501,46 @@ protected void computeKey(
     fp.addStringMap(getExecutionInfo());
   }
 
+  /**
+   * Returns whether the test should be executed unconditionally based on the test configuration,
+   * the test properties, and the previous test result when known.
+   */
   @Override
   public boolean executeUnconditionally() {
     // Note: isVolatile must return true if executeUnconditionally can ever return true
     // for this instance.
-    return computeExecuteUnconditionallyFromTestStatus();
+    return executeUnconditionally(
+        testConfiguration.cacheTestResults(),
+        this::maybeReadCacheStatus,
+        testProperties.isExternal(),
+        executionSettings.getTotalRuns());
+  }
+
+  @VisibleForTesting
+  static boolean executeUnconditionally(
+      TriState cacheTestResults,
+      Supplier<Optional<TestResultData>> prevStatus, // lazy to avoid I/O if possible
+      boolean isExternal,
+      int runsPerTest) {
+    if (!shouldCacheResult(cacheTestResults, isExternal, runsPerTest)) {
+      return true;
+    }
+    Optional<TestResultData> status = prevStatus.get();
+    if (status.isEmpty()) {
+      // Execute unconditionally if a previous test result is not available.
+      return true;
+    }
+    if (!status.get().getCachable()) {
+      // Execute unconditionally if the previous test result was marked non-cacheable.
+      // It seems that this can only happen with --experimental_cancel_concurrent_tests.
+      return true;
+    }
+    if (cacheTestResults == TriState.AUTO && !status.get().getTestPassed()) {
+      // Execute unconditionally if the previous test result was a failure, as otherwise we can
+      // get stuck forever in the event of a flaky failure.
+      return true;
+    }
+    return false;
   }
 
   @Override
@@ -553,72 +588,41 @@ TestResultData readCacheStatus() throws IOException {
     }
   }
 
-  private boolean computeExecuteUnconditionallyFromTestStatus() {
-    CacheableTest cacheStatus =
-        canBeCached(
-            testConfiguration.cacheTestResults(),
-            this::maybeReadCacheStatus,
-            testProperties.isExternal(),
-            executionSettings.getTotalRuns());
-    switch (cacheStatus) {
-      case NO_STATUS_ON_DISK:
-        // execute unconditionally if no status available on disk
-      case NO:
-        return true;
-      case YES:
-        return false;
-    }
-    throw new IllegalStateException("Unreachable. Bad cache status: " + cacheStatus);
+  /**
+   * Returns whether a cached result should be accepted from a disk/remote cache, depending on the
+   * test configuration and test properties.
+   *
+   * <p>This should *not* be used to determine whether to accept a cached result from the action
+   * cache. Call {@link #executeUnconditionally} instead.
+   *
+   * <p>Unlike {@link #executeUnconditionally}, this decision does not depend on the previous test
+   * result, as otherwise we wouldn't attempt to hit the disk/remote cache when the test has changed
+   * from a failing to a passing state since the last execution without causing the action to be
+   * reanalyzed (for example, by editing a source file into a passing state that has been previously
+   * seen).
+   *
+   * <p>We're not concerned about a flaky failure becoming sticky in the disk/remote cache, because
+   * it's impossible to solve this problem generally. In any case, this can only occur with a remote
+   * execution implementation that caches failures, as we never upload them to a disk/remote cache
+   * ourselves.
+   */
+  public boolean shouldCacheResult() {
+    return shouldCacheResult(
+        testConfiguration.cacheTestResults(),
+        testProperties.isExternal(),
+        executionSettings.getTotalRuns());
   }
 
   @VisibleForTesting
-  static CacheableTest canBeCached(
-      TriState cacheTestResults,
-      Supplier<Optional<TestResultData>>
-          prevStatus, // Lazy evaluation to avoid a disk read if possible.
-      boolean isExternal,
-      int runsPerTest) {
+  static boolean shouldCacheResult(
+      TriState cacheTestResults, boolean isExternal, int runsPerTest) {
     if (isExternal || cacheTestResults == TriState.NO) {
-      return CacheableTest.NO;
+      return false;
     }
     if (cacheTestResults == TriState.AUTO && runsPerTest > 1) {
-      return CacheableTest.NO;
-    }
-    Optional<TestResultData> status = prevStatus.get();
-    // unable to read status from disk
-    if (status.isEmpty()) {
-      return CacheableTest.NO_STATUS_ON_DISK;
-    }
-    if (!status.get().getCachable()) {
-      return CacheableTest.NO;
-    }
-    if (cacheTestResults == TriState.AUTO && !status.get().getTestPassed()) {
-      return CacheableTest.NO;
-    }
-    return CacheableTest.YES;
-  }
-
-  /**
-   * Returns whether caching has been deemed safe by looking at the previous test run (for local
-   * caching). If the previous run is not present or cached status was retrieved unsuccessfully,
-   * return "true" here, as remote execution caching should be safe.
-   */
-  public boolean shouldCacheResult() {
-    CacheableTest cacheStatus =
-        canBeCached(
-            testConfiguration.cacheTestResults(),
-            this::maybeReadCacheStatus,
-            testProperties.isExternal(),
-            executionSettings.getTotalRuns());
-    switch (cacheStatus) {
-        // optimistically cache results if status unavailable
-      case YES:
-      case NO_STATUS_ON_DISK:
-        return true;
-      case NO:
-        return false;
+      return false;
     }
-    throw new IllegalStateException("Unreachable. Bad cache status: " + cacheStatus);
+    return true;
   }
 
   @Override
@@ -1298,11 +1302,4 @@ public boolean isEmpty() {
       return false;
     }
   }
-
-  @VisibleForTesting
-  enum CacheableTest {
-    YES,
-    NO,
-    NO_STATUS_ON_DISK
-  }
 }
diff --git a/src/main/java/com/google/devtools/build/lib/exec/StandaloneTestStrategy.java b/src/main/java/com/google/devtools/build/lib/exec/StandaloneTestStrategy.java
index b30534f161bed1..6e1a9dc2beadd7 100644
--- a/src/main/java/com/google/devtools/build/lib/exec/StandaloneTestStrategy.java
+++ b/src/main/java/com/google/devtools/build/lib/exec/StandaloneTestStrategy.java
@@ -109,7 +109,10 @@ public TestRunnerSpawn createTestRunnerSpawn(
 
     Map<String, String> executionInfo =
         new TreeMap<>(action.getTestProperties().getExecutionInfo());
-    if (!action.shouldCacheResult()) {
+    if (!action.shouldAcceptCachedResult()) {
+      // TODO(tjgq): We want to reject a previously cached result, but not prevent the result of the
+      // current execution from being uploaded. We should introduce a separate execution requirement
+      // for this.
       executionInfo.put(ExecutionRequirements.NO_CACHE, "");
     }
     executionInfo.put(ExecutionRequirements.TIMEOUT, "" + getTimeout(action).getSeconds());
diff --git a/src/test/shell/bazel/disk_cache_test.sh b/src/test/shell/bazel/disk_cache_test.sh
index 05c6546ebe5f73..f4f3c3012e9034 100755
--- a/src/test/shell/bazel/disk_cache_test.sh
+++ b/src/test/shell/bazel/disk_cache_test.sh
@@ -91,4 +91,40 @@ EOF
 
 }
 
-run_suite "local action cache test"
+function test_cache_hit_on_source_edit_after_test_failure() {
+  # Regression test for https://github.com/bazelbuild/bazel/issues/11057.
+
+  local -r CACHE_DIR="${TEST_TMPDIR}/cache"
+  rm -rf "$CACHE_DIR"
+
+  mkdir -p a
+  cat > a/BUILD <<'EOF'
+sh_test(
+    name = "test",
+    srcs = ["test.sh"],
+)
+EOF
+  echo "exit 0" > a/test.sh
+  chmod +x a/test.sh
+
+  # Populate the cache with a passing test.
+  bazel test --disk_cache="$CACHE_DIR" //a:test >& $TEST_log \
+    || fail "Expected test to pass"
+
+  # Turn the test into a failing one.
+  echo "exit 1" > a/test.sh
+
+  # Check that the test fails.
+  bazel test --disk_cache="$CACHE_DIR" //a:test >& $TEST_log \
+    && fail "Expected test to fail"
+
+  # Turn the test into a passing one again.
+  echo "exit 0" > a/test.sh
+
+  # Check that we hit the previously populated cache.
+  bazel test --disk_cache="$CACHE_DIR" //a:test >& $TEST_log \
+    || fail "Expected test to pass"
+  expect_log "(cached) PASSED"
+}
+
+run_suite "disk cache test"