Skip to content

Commit 4fa841e

Browse files
committed
Update actor-sqlite-test with abandonAlarm
1 parent b370e0a commit 4fa841e

2 files changed

Lines changed: 96 additions & 0 deletions

File tree

src/workerd/io/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,7 @@ kj_test(
426426
deps = [
427427
":actor",
428428
":io-gate",
429+
"//src/workerd/util:autogate",
429430
"//src/workerd/util:test",
430431
"//src/workerd/util:test-util",
431432
"@sqlite3",

src/workerd/io/actor-sqlite-test.c++

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2899,5 +2899,100 @@ KJ_TEST("explicit transaction: commit failure breaks output gate even for unconf
28992899
KJ_EXPECT_THROW_MESSAGE("commit failed", promise.wait(test.ws));
29002900
}
29012901

2902+
KJ_TEST("ActorSqlite alarm state is stale after max counted retry failures without fix") {
  // Regression test for STOR-3654 -- demonstrates the BUG (gate OFF).
  //
  // After the maximum number of counted handler failures, abandonAlarm() is invoked while the
  // "actor-alarm-abandoned-cleanup" autogate is not enabled. The alarm is expected to stay in
  // SQLite, i.e. getAlarm() keeps reporting the stale time.

  // Tear down any autogate state so that this test runs with the gate OFF.
  util::Autogate::deinitAutogate();

  ActorSqliteTest test;

  // Set the alarm and let both the schedule request and the commit complete.
  test.setAlarm(oneMs);
  test.pollAndExpectCalls({"scheduleRun(1ms)"})[0]->fulfill();
  test.pollAndExpectCalls({"commit"})[0]->fulfill();
  test.pollAndExpectCalls({});
  KJ_ASSERT(expectSync(test.getAlarm()) == oneMs);

  // Simulate ALARM_RETRY_MAX_TRIES (= 6) counted handler failures: arm the handler, then cancel
  // the deferred deletion as happens when the handler fails.
  for (auto i = 0; i < 6 /* WorkerInterface::ALARM_RETRY_MAX_TRIES */; i++) {
    auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime);
    KJ_ASSERT(armResult.is<ActorSqlite::RunAlarmHandler>());
    test.actor.cancelDeferredAlarmDeletion();
    test.pollAndExpectCalls({});
  }

  // The caller gives up and calls abandonAlarm(). Previously this test never made the call, so
  // it did not actually exercise the gate-OFF path it describes. With the gate OFF the call must
  // be a no-op: no commit and no reschedule may be issued.
  // NOTE(review): assumes the autogate is checked inside ActorSqlite::abandonAlarm() -- confirm.
  test.actor.abandonAlarm(oneMs).wait(test.ws);
  test.pollAndExpectCalls({});

  // Without fix: getAlarm() still returns oneMs (alarm is still in SQLite).
  KJ_ASSERT(expectSync(test.getAlarm()) == oneMs);
}
2926+
2927+
KJ_TEST("ActorSqlite alarm cleared by abandonAlarm after max counted retry failures") {
  // STOR-3654 regression coverage for the FIX: with the "actor-alarm-abandoned-cleanup" autogate
  // enabled, abandonAlarm() removes the alarm once the retry budget has been used up.

  // Enable the gate for the duration of this test only.
  util::Autogate::initAutogateNamesForTest({"actor-alarm-abandoned-cleanup"_kj});
  KJ_DEFER(util::Autogate::deinitAutogate());

  ActorSqliteTest test;

  // Establish the starting state: an alarm at oneMs that is scheduled and committed.
  test.setAlarm(oneMs);
  test.pollAndExpectCalls({"scheduleRun(1ms)"})[0]->fulfill();
  test.pollAndExpectCalls({"commit"})[0]->fulfill();
  test.pollAndExpectCalls({});
  KJ_ASSERT(expectSync(test.getAlarm()) == oneMs);

  // Play back one counted handler failure per allowed try.
  constexpr int countedFailures = 6;  // WorkerInterface::ALARM_RETRY_MAX_TRIES
  for (int attempt = 1; attempt <= countedFailures; ++attempt) {
    // The arm result is kept in a named local so that it is still alive when the deferred
    // deletion is cancelled.
    auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime);
    KJ_ASSERT(armResult.is<ActorSqlite::RunAlarmHandler>());
    test.actor.cancelDeferredAlarmDeletion();

    // A failed run must not trigger any storage or scheduling activity; the alarm stays in
    // SQLite so that it can be retried.
    test.pollAndExpectCalls({});
  }

  // AlarmManager gives up at this point and notifies the actor through abandonAlarm().
  // Gate ON: the alarm is cleared, observed as a commit followed by scheduleRun(none)
  // (the move-later path).
  test.actor.abandonAlarm(oneMs).wait(test.ws);
  test.pollAndExpectCalls({"commit"})[0]->fulfill();
  test.pollAndExpectCalls({"scheduleRun(none)"})[0]->fulfill();
  test.pollAndExpectCalls({});

  // The alarm has been deleted from SQLite, so getAlarm() reports none.
  KJ_ASSERT(expectSync(test.getAlarm()) == kj::none);
}
2960+
2961+
KJ_TEST("ActorSqlite alarm preserved after ALARM_RETRY_MAX_TRIES uncounted (internal) failures") {
  // STOR-3654 regression coverage: the fix must not over-fire for internal errors.
  //
  // If every failure is uncounted (retryCountsAgainstLimit=false, i.e. an infrastructure error),
  // AlarmManager's countedRetry never reaches the limit and abandonAlarm is NEVER called. The
  // alarm therefore has to stay set in SQLite the whole time so that AlarmManager can keep
  // retrying indefinitely.

  // Run with the gate ON, restoring autogate state when the test ends.
  util::Autogate::initAutogateNamesForTest({"actor-alarm-abandoned-cleanup"_kj});
  KJ_DEFER(util::Autogate::deinitAutogate());

  ActorSqliteTest test;

  // Establish the starting state: an alarm at oneMs that is scheduled and committed.
  test.setAlarm(oneMs);
  test.pollAndExpectCalls({"scheduleRun(1ms)"})[0]->fulfill();
  test.pollAndExpectCalls({"commit"})[0]->fulfill();
  test.pollAndExpectCalls({});
  KJ_ASSERT(expectSync(test.getAlarm()) == oneMs);

  // Drive uncounted failures far beyond the retry limit. countedRetry stays at 0, AlarmManager
  // never gives up, and abandonAlarm is never called. Alarms have been seen failing hundreds of
  // times in production because of infrastructure errors, hence the large count.
  constexpr int retryLimit = 6;  // WorkerInterface::ALARM_RETRY_MAX_TRIES
  constexpr int totalFailures = 100;
  for (int failures = 1; failures <= totalFailures; ++failures) {
    // The arm result is kept in a named local so that it is still alive when the deferred
    // deletion is cancelled.
    auto armResult = test.actor.armAlarmHandler(oneMs, nullptr, testCurrentTime);
    KJ_ASSERT(armResult.is<ActorSqlite::RunAlarmHandler>());
    test.actor.cancelDeferredAlarmDeletion();
    test.pollAndExpectCalls({});

    // Verify the alarm is still set right at the retry-limit boundary and after the last failure.
    if (failures == retryLimit || failures == totalFailures) {
      KJ_ASSERT(expectSync(test.getAlarm()) == oneMs);
    }
  }
}
2996+
29022997
} // namespace
29032998
} // namespace workerd

0 commit comments

Comments
 (0)