@@ -2899,5 +2899,100 @@ KJ_TEST("explicit transaction: commit failure breaks output gate even for unconf
28992899 KJ_EXPECT_THROW_MESSAGE (" commit failed" , promise.wait (test.ws ));
29002900}
29012901
2902+ KJ_TEST (" ActorSqlite alarm state is stale after max counted retry failures without fix" ) {
2903+ // Regression test for STOR-3654 -- demonstrates the BUG (gate OFF).
2904+
2905+ util::Autogate::deinitAutogate ();
2906+
2907+ ActorSqliteTest test;
2908+
2909+ test.setAlarm (oneMs);
2910+ test.pollAndExpectCalls ({" scheduleRun(1ms)" })[0 ]->fulfill ();
2911+ test.pollAndExpectCalls ({" commit" })[0 ]->fulfill ();
2912+ test.pollAndExpectCalls ({});
2913+ KJ_ASSERT (expectSync (test.getAlarm ()) == oneMs);
2914+
2915+ for (auto i = 0 ; i < 6 /* WorkerInterface::ALARM_RETRY_MAX_TRIES */ ; i++) {
2916+ auto armResult = test.actor .armAlarmHandler (oneMs, nullptr , testCurrentTime);
2917+ KJ_ASSERT (armResult.is <ActorSqlite::RunAlarmHandler>());
2918+ test.actor .cancelDeferredAlarmDeletion ();
2919+ test.pollAndExpectCalls ({});
2920+ }
2921+
2922+ // Without fix: getAlarm() still returns oneMs (alarm is still in SQLite).
2923+ // abandonAlarm is a no-op with gate OFF.
2924+ KJ_ASSERT (expectSync (test.getAlarm ()) == oneMs);
2925+ }
2926+
2927+ KJ_TEST (" ActorSqlite alarm cleared by abandonAlarm after max counted retry failures" ) {
2928+ // Regression test for STOR-3654 -- demonstrates the FIX (gate ON).
2929+
2930+ util::Autogate::initAutogateNamesForTest ({" actor-alarm-abandoned-cleanup" _kj});
2931+ KJ_DEFER (util::Autogate::deinitAutogate ());
2932+
2933+ ActorSqliteTest test;
2934+
2935+ test.setAlarm (oneMs);
2936+ test.pollAndExpectCalls ({" scheduleRun(1ms)" })[0 ]->fulfill ();
2937+ test.pollAndExpectCalls ({" commit" })[0 ]->fulfill ();
2938+ test.pollAndExpectCalls ({});
2939+ KJ_ASSERT (expectSync (test.getAlarm ()) == oneMs);
2940+
2941+ // Simulate ALARM_RETRY_MAX_TRIES (= 6) counted handler failures.
2942+ for (auto i = 0 ; i < 6 /* WorkerInterface::ALARM_RETRY_MAX_TRIES */ ; i++) {
2943+ auto armResult = test.actor .armAlarmHandler (oneMs, nullptr , testCurrentTime);
2944+ KJ_ASSERT (armResult.is <ActorSqlite::RunAlarmHandler>());
2945+ test.actor .cancelDeferredAlarmDeletion ();
2946+ // Each failure leaves alarm in SQLite (correct for retries 1-5).
2947+ test.pollAndExpectCalls ({});
2948+ }
2949+
2950+ // AlarmManager has decided to give up. It calls abandonAlarm() on the actor.
2951+ // With gate ON: setAlarm(null) is called -> commit -> scheduleRun(none) (move-later path).
2952+ test.actor .abandonAlarm (oneMs).wait (test.ws );
2953+ test.pollAndExpectCalls ({" commit" })[0 ]->fulfill ();
2954+ test.pollAndExpectCalls ({" scheduleRun(none)" })[0 ]->fulfill ();
2955+ test.pollAndExpectCalls ({});
2956+
2957+ // getAlarm() now returns null (alarm deleted from SQLite).
2958+ KJ_ASSERT (expectSync (test.getAlarm ()) == kj::none);
2959+ }
2960+
2961+ KJ_TEST (" ActorSqlite alarm preserved after ALARM_RETRY_MAX_TRIES uncounted (internal) failures" ) {
2962+ // Regression test for STOR-3654 -- ensures the fix doesn't over-fire for internal errors.
2963+ //
2964+ // When all ALARM_RETRY_MAX_TRIES failures are uncounted (retryCountsAgainstLimit=false,
2965+ // i.e. infrastructure errors), AlarmManager's countedRetry never reaches the limit and
2966+ // abandonAlarm is NEVER called. The alarm must remain set in SQLite throughout so that
2967+ // AlarmManager can keep retrying indefinitely.
2968+
2969+ util::Autogate::initAutogateNamesForTest ({" actor-alarm-abandoned-cleanup" _kj});
2970+ KJ_DEFER (util::Autogate::deinitAutogate ());
2971+
2972+ ActorSqliteTest test;
2973+
2974+ test.setAlarm (oneMs);
2975+ test.pollAndExpectCalls ({" scheduleRun(1ms)" })[0 ]->fulfill ();
2976+ test.pollAndExpectCalls ({" commit" })[0 ]->fulfill ();
2977+ test.pollAndExpectCalls ({});
2978+ KJ_ASSERT (expectSync (test.getAlarm ()) == oneMs);
2979+
2980+ // Simulate uncounted failures well past ALARM_RETRY_MAX_TRIES (= 6).
2981+ // countedRetry stays at 0; AlarmManager never gives up; abandonAlarm is never called.
2982+ // We've seen alarms fail hundreds of times due to infrastructure errors in production,
2983+ // so we check both at the boundary (6) and well beyond it (100).
2984+ for (auto i = 0 ; i < 100 ; i++) {
2985+ auto armResult = test.actor .armAlarmHandler (oneMs, nullptr , testCurrentTime);
2986+ KJ_ASSERT (armResult.is <ActorSqlite::RunAlarmHandler>());
2987+ test.actor .cancelDeferredAlarmDeletion ();
2988+ test.pollAndExpectCalls ({});
2989+
2990+ // Check at the ALARM_RETRY_MAX_TRIES boundary and at the end.
2991+ if (i == 5 || i == 99 ) {
2992+ KJ_ASSERT (expectSync (test.getAlarm ()) == oneMs);
2993+ }
2994+ }
2995+ }
2996+
29022997} // namespace
29032998} // namespace workerd
0 commit comments