From 572b3c759e76d920d505ea76c2e85eec746f9261 Mon Sep 17 00:00:00 2001 From: Jake Merdich Date: Fri, 2 Sep 2022 13:09:01 -0400 Subject: [PATCH 1/2] Get suspends from logind to ignore watchdog then On every resume, the watchdog thread happily reports that there has been a deadlock since the canary hasn't responded since before the computer resumed. Get notifications from logind over dbus to tell when suspend happens and ignore the one watchdog violation that might happen then. Depending on how threads are scheduled on resume (canary scheduled first), the watchdog after resume might not happen and the next real deadlock will take two watchdog cycles to hit, which is not horrible in my mind but might be fixable by having the ignore condition have a timeout for N seconds after resume. --- rtkit-daemon.c | 62 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/rtkit-daemon.c b/rtkit-daemon.c index 17122fa..ff841b2 100644 --- a/rtkit-daemon.c +++ b/rtkit-daemon.c @@ -195,6 +195,7 @@ static const char *proc = NULL; static int quit_fd = -1, canary_fd = -1; static pthread_t canary_thread_id = 0, watchdog_thread_id = 0; static volatile uint32_t refuse_until = 0; +static volatile bool ignore_next_deadlock = false; static const char *get_proc_path(void) { /* Useful for chroot environments */ @@ -1256,6 +1257,32 @@ static int handle_dbus_prop_get(const char* property, DBusMessage *r) { return 0; } +static DBusHandlerResult dbus_sleep_handler(DBusConnection *c, DBusMessage *m, void *userdata) { + (void)c; + (void)userdata; + + DBusHandlerResult result = DBUS_HANDLER_RESULT_HANDLED; + DBusError error; + dbus_error_init(&error); + + if (dbus_message_is_signal(m, "org.freedesktop.login1.Manager", "PrepareForSleep")) { + bool isSleep = false; // true=sleep, false=resume + if (dbus_message_get_args(m, &error, DBUS_TYPE_BOOLEAN, &isSleep, DBUS_TYPE_INVALID)) { + if (isSleep) { + ignore_next_deadlock = true; + __sync_synchronize(); + } + } + } + else + { + result = DBUS_HANDLER_RESULT_NOT_YET_HANDLED; + } + + dbus_error_free(&error); + return result; +} + static DBusHandlerResult dbus_handler(DBusConnection *c, DBusMessage *m, void *userdata) { DBusError error; DBusMessage *r = NULL; @@ -1475,7 +1502,16 @@ static int setup_dbus(DBusConnection **c) { goto fail; } + dbus_bus_add_match(*c, "type='signal',interface='org.freedesktop.login1.Manager',member='PrepareForSleep'", &error); + if (dbus_error_is_set(&error)) + { + syslog(LOG_ERR, "Failed to add signal on bus: %s\n", error.message); + abort(); + goto fail; + } + assert_se(dbus_connection_register_object_path(*c, RTKIT_OBJECT_PATH, &vtable, NULL)); + assert_se(dbus_connection_add_filter(*c, dbus_sleep_handler, NULL, NULL)); return 0; @@ -1621,14 +1657,26 @@ static void* watchdog_thread(void *data) { if (TIMESPEC_MSEC(last_cheep) + canary_watchdog_msec <= TIMESPEC_MSEC(now)) { last_cheep = now; - syslog(LOG_WARNING, "The canary thread is apparently starving. Taking action.\n"); - refuse_until = (uint32_t) now.tv_sec + canary_refusal_sec; - __sync_synchronize(); - - if (canary_demote_unknown) - reset_all(); + if (ignore_next_deadlock) + { + // On sleep-resume, we get a spurious long time interval from when the system was sleeping. + // This only can happen once per resume and real deadlocks that happen here will be detected + // at the next watchdog interval. + syslog(LOG_INFO, "The canary thread is apparently starving; ignoring once due to system resume.\n"); + ignore_next_deadlock = false; + __sync_synchronize(); + } else - reset_known(); + { + syslog(LOG_WARNING, "The canary thread is apparently starving. Taking action.\n"); + refuse_until = (uint32_t) now.tv_sec + canary_refusal_sec; + __sync_synchronize(); + + if (canary_demote_unknown) + reset_all(); + else + reset_known(); + } continue; } } From dd8e5e9170d4e43a937aead4a1569803efc8bfe3 Mon Sep 17 00:00:00 2001 From: Jake Merdich Date: Fri, 2 Sep 2022 14:23:55 -0400 Subject: [PATCH 2/2] Add a timeout for resume-deadlock ignores On my personal computer, the scheduling race condition mentioned in the last commit happens about ~1/4 of the time. That's too often to ignore. Only make the resume-ignore condition apply for a few seconds after resume. Additionally, this feature can now be disabled by setting a threshold of zero. --- rtkit-daemon.c | 55 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/rtkit-daemon.c b/rtkit-daemon.c index ff841b2..4a427eb 100644 --- a/rtkit-daemon.c +++ b/rtkit-daemon.c @@ -145,6 +145,9 @@ static unsigned canary_watchdog_realtime_priority = 99; /* How long after the canary died shall we refuse further RT requests? */ static unsigned canary_refusal_sec = 5*60; +/* How long after resume do we ignore canary clock jumps? */ +static unsigned canary_resume_ignore_sec = 2; + /* Demote root processes? */ static bool canary_demote_root = FALSE; @@ -195,7 +198,7 @@ static const char *proc = NULL; static int quit_fd = -1, canary_fd = -1; static pthread_t canary_thread_id = 0, watchdog_thread_id = 0; static volatile uint32_t refuse_until = 0; -static volatile bool ignore_next_deadlock = false; +static volatile uint32_t ignore_deadlock_until = 0; static const char *get_proc_path(void) { /* Useful for chroot environments */ @@ -1269,9 +1272,17 @@ static DBusHandlerResult dbus_sleep_handler(DBusConnection *c, DBusMessage *m, v bool isSleep = false; // true=sleep, false=resume if (dbus_message_get_args(m, &error, DBUS_TYPE_BOOLEAN, &isSleep, DBUS_TYPE_INVALID)) { if (isSleep) { - ignore_next_deadlock = true; - __sync_synchronize(); + // Ignore deadlocks indefinitely; will fixup on first deadlock or resume + ignore_deadlock_until = UINT32_MAX; + } + else if (ignore_deadlock_until == UINT32_MAX) + { + struct timespec now; + assert_se(clock_gettime(CLOCK_MONOTONIC, &now) == 0); + ignore_deadlock_until = (uint32_t) now.tv_sec + canary_resume_ignore_sec; + } + __sync_synchronize(); } } else @@ -1502,17 +1513,19 @@ static int setup_dbus(DBusConnection **c) { goto fail; } - dbus_bus_add_match(*c, "type='signal',interface='org.freedesktop.login1.Manager',member='PrepareForSleep'", &error); - if (dbus_error_is_set(&error)) + assert_se(dbus_connection_register_object_path(*c, RTKIT_OBJECT_PATH, &vtable, NULL)); + + if (canary_resume_ignore_sec) { - syslog(LOG_ERR, "Failed to add signal on bus: %s\n", error.message); - abort(); - goto fail; + dbus_bus_add_match(*c, "type='signal',interface='org.freedesktop.login0.Manager',member='PrepareForSleep'", &error); + if (dbus_error_is_set(&error)) + { + syslog(LOG_ERR, "Failed to add signal on bus: %s\n", error.message); + goto fail; + } + assert_se(dbus_connection_add_filter(*c, dbus_sleep_handler, NULL, NULL)); } - assert_se(dbus_connection_register_object_path(*c, RTKIT_OBJECT_PATH, &vtable, NULL)); - assert_se(dbus_connection_add_filter(*c, dbus_sleep_handler, NULL, NULL)); - return 0; fail: @@ -1657,13 +1670,13 @@ static void* watchdog_thread(void *data) { if (TIMESPEC_MSEC(last_cheep) + canary_watchdog_msec <= TIMESPEC_MSEC(now)) { last_cheep = now; - if (ignore_next_deadlock) + if (now.tv_sec < (time_t) ignore_deadlock_until) { // On sleep-resume, we get a spurious long time interval from when the system was sleeping. // This only can happen once per resume and real deadlocks that happen here will be detected // at the next watchdog interval. syslog(LOG_INFO, "The canary thread is apparently starving; ignoring once due to system resume.\n"); - ignore_next_deadlock = false; + ignore_deadlock_until = 0; __sync_synchronize(); } else @@ -1923,6 +1936,7 @@ enum { ARG_CANARY_DEMOTE_ROOT, ARG_CANARY_DEMOTE_UNKNOWN, ARG_CANARY_REFUSE_SEC, + ARG_CANARY_RESUME_IGNORE_SEC, ARG_STDERR, ARG_INTROSPECT }; @@ -1952,6 +1966,7 @@ static const struct option long_options[] = { { "canary-demote-root", no_argument, 0, ARG_CANARY_DEMOTE_ROOT }, { "canary-demote-unknown", no_argument, 0, ARG_CANARY_DEMOTE_UNKNOWN }, { "canary-refuse-sec", required_argument, 0, ARG_CANARY_REFUSE_SEC }, + { "canary-resume-ignore-sec", required_argument, 0, ARG_CANARY_RESUME_IGNORE_SEC }, { "stderr", no_argument, 0, ARG_STDERR }, { "introspect", no_argument, 0, ARG_INTROSPECT }, { NULL, 0, 0, 0} @@ -2266,6 +2281,20 @@ static int parse_command_line(int argc, char *argv[], int *ret) { break; } + case ARG_CANARY_RESUME_IGNORE_SEC: { + char *e = NULL; + unsigned long u; + + errno = 0; + u = strtoul(optarg, &e, 0); + if (errno != 0 || !e || *e) { + fprintf(stderr, "--canary-resume-ignore-sec parameter invalid.\n"); + return -1; + } + canary_resume_ignore_sec = (uint32_t) u; + break; + } + case ARG_STDERR: log_stderr = TRUE; break;