From ffe630128aaa638a05ff27b4153047a9a197f903 Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Sat, 9 Dec 2023 16:34:59 -0600 Subject: [PATCH 01/12] Add check for stuck jobs in poll() --- client/app.cpp | 1 + client/app.h | 2 ++ client/app_control.cpp | 19 +++++++++++++++++++ client/client_state.h | 3 +++ lib/error_numbers.h | 1 + 5 files changed, 26 insertions(+) diff --git a/client/app.cpp b/client/app.cpp index a0254c277f3..650ed42998e 100644 --- a/client/app.cpp +++ b/client/app.cpp @@ -114,6 +114,7 @@ ACTIVE_TASK::ACTIVE_TASK() { fraction_done_elapsed_time = 0; first_fraction_done = 0; first_fraction_done_elapsed_time = 0; + stuck_fraction_done = 0; scheduler_state = CPU_SCHED_UNINITIALIZED; next_scheduler_state = CPU_SCHED_UNINITIALIZED; signal = 0; diff --git a/client/app.h b/client/app.h index 1999acaeb11..824b7909501 100644 --- a/client/app.h +++ b/client/app.h @@ -112,6 +112,8 @@ struct ACTIVE_TASK { // first frac done reported during this run of task double first_fraction_done_elapsed_time; // elapsed time when the above was reported + double stuck_fraction_done; + // keeps track if fraction has change to ensure it is not stuck SCHEDULER_STATE scheduler_state; SCHEDULER_STATE next_scheduler_state; // temp int signal; diff --git a/client/app_control.cpp b/client/app_control.cpp index b03a9875db1..78a94a739a6 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -149,6 +149,25 @@ bool ACTIVE_TASK_SET::poll() { } } } + + // check if a job is "stuck" (did not make progress in the last hour) + // notify the user about the issue + // abort after some time + static double last_fraction_done_check_time = 0; + if (gstate.now - last_fraction_done_check_time > FRACTION_DONE_POLL_PERIOD) { + for (i=0; istuck_fraction_done == atp->fraction_done + && atp->current_cpu_time < 10) { + msg_printf(atp->result->project, MSG_INFO, "Task has not made progress in last hour, consider aborting"); + // atp->abort_task(EXIT_TASK_STUCK, "Task has not made progress in 
last hour, begin to abort"); + } + atp->stuck_fraction_done = atp->fraction_done; + } + last_fraction_done_check_time = gstate.now; + } + if (action) { gstate.set_client_state_dirty("ACTIVE_TASK_SET::poll"); } diff --git a/client/client_state.h b/client/client_state.h index c1bbb2b9e91..ff7c34a8cbf 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -601,6 +601,9 @@ extern THREAD throttle_thread; #define MEMORY_USAGE_PERIOD 10 // computer memory usage and check for exclusive apps this often +#define FRACTION_DONE_POLL_PERIOD 3600 + // poll if fraction done has change within an hour to see if it is stuck + //////// WORK FETCH #define WORK_FETCH_PERIOD 60 diff --git a/lib/error_numbers.h b/lib/error_numbers.h index d6110acc3e9..5008b1e9924 100644 --- a/lib/error_numbers.h +++ b/lib/error_numbers.h @@ -42,6 +42,7 @@ #define EXIT_INIT_FAILURE 206 #define EXIT_NO_SUB_TASKS 207 #define EXIT_SUB_TASK_FAILURE 208 +#define EXIT_TASK_STUCK 209 // Function return values. // NOTE: add new errors to the end of the list and don't change From 6a51c304979ec8953784977dd79e3712e47cb2fd Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Sat, 9 Dec 2023 16:42:51 -0600 Subject: [PATCH 02/12] Fix trailing whitespace --- client/app_control.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/app_control.cpp b/client/app_control.cpp index 78a94a739a6..f495849a3d1 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -158,7 +158,7 @@ bool ACTIVE_TASK_SET::poll() { for (i=0; istuck_fraction_done == atp->fraction_done + if (last_fraction_done_check_time == 0 && atp->stuck_fraction_done == atp->fraction_done && atp->current_cpu_time < 10) { msg_printf(atp->result->project, MSG_INFO, "Task has not made progress in last hour, consider aborting"); // atp->abort_task(EXIT_TASK_STUCK, "Task has not made progress in last hour, begin to abort"); From 27f75944a4c4c5a0a14f3d5b24a315902d852108 Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Sat, 9 Dec 
2023 19:37:48 -0600 Subject: [PATCH 03/12] Update logic for stuck check based on feedback --- client/app.cpp | 4 +++- client/app.h | 8 ++++++-- client/app_control.cpp | 26 ++++++++++++++++++-------- client/client_state.h | 2 +- lib/error_numbers.h | 1 - 5 files changed, 28 insertions(+), 13 deletions(-) diff --git a/client/app.cpp b/client/app.cpp index 650ed42998e..df288cebc56 100644 --- a/client/app.cpp +++ b/client/app.cpp @@ -114,7 +114,9 @@ ACTIVE_TASK::ACTIVE_TASK() { fraction_done_elapsed_time = 0; first_fraction_done = 0; first_fraction_done_elapsed_time = 0; - stuck_fraction_done = 0; + stuck_check_fraction_done = 0; + stuck_check_elapsed_time = 0; + stuck_check_cpu_time = 0; scheduler_state = CPU_SCHED_UNINITIALIZED; next_scheduler_state = CPU_SCHED_UNINITIALIZED; signal = 0; diff --git a/client/app.h b/client/app.h index 824b7909501..8a42cff9436 100644 --- a/client/app.h +++ b/client/app.h @@ -112,8 +112,12 @@ struct ACTIVE_TASK { // first frac done reported during this run of task double first_fraction_done_elapsed_time; // elapsed time when the above was reported - double stuck_fraction_done; - // keeps track if fraction has change to ensure it is not stuck + double stuck_check_fraction_done; + // fraction done since last check for stuck + double stuck_check_elapsed_time; + // elapsed time at last stuck check + double stuck_check_cpu_time; + // cpu time at last check SCHEDULER_STATE scheduler_state; SCHEDULER_STATE next_scheduler_state; // temp int signal; diff --git a/client/app_control.cpp b/client/app_control.cpp index f495849a3d1..137df48e195 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -153,19 +153,29 @@ bool ACTIVE_TASK_SET::poll() { // check if a job is "stuck" (did not make progress in the last hour) // notify the user about the issue // abort after some time - static double last_fraction_done_check_time = 0; - if (gstate.now - last_fraction_done_check_time > FRACTION_DONE_POLL_PERIOD) { + static double 
last_stuck_check_time = 0; + if (gstate.now - last_stuck_check_time > STUCK_CHECK_POLL_PERIOD) { for (i=0; istuck_fraction_done == atp->fraction_done - && atp->current_cpu_time < 10) { + if (!atp->non_cpu_intensive()) continue; + if (atp->sporadic()) continue; + if (atp->stuck_check_elapsed_time == 0) { + // first pass + atp->stuck_check_elapsed_time = atp->elapsed_time; + atp->stuck_check_fraction_done = atp->fraction_done; + atp->stuck_check_cpu_time = atp->current_cpu_time; + } + // if fraction done does not change and cpu time is <10, message the user + if (atp->stuck_check_fraction_done == atp->fraction_done && + (atp->current_cpu_time - atp->stuck_check_cpu_time) < 10) { msg_printf(atp->result->project, MSG_INFO, "Task has not made progress in last hour, consider aborting"); - // atp->abort_task(EXIT_TASK_STUCK, "Task has not made progress in last hour, begin to abort"); } - atp->stuck_fraction_done = atp->fraction_done; + if (atp->elapsed_time < atp->stuck_check_elapsed_time + STUCK_CHECK_POLL_PERIOD) continue; + atp->stuck_check_elapsed_time = atp->elapsed_time; + atp->stuck_check_fraction_done = atp->fraction_done; + atp->stuck_check_cpu_time = atp->current_cpu_time; } - last_fraction_done_check_time = gstate.now; + last_stuck_check_time = gstate.now; } if (action) { diff --git a/client/client_state.h b/client/client_state.h index ff7c34a8cbf..9876bf1b03f 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -601,7 +601,7 @@ extern THREAD throttle_thread; #define MEMORY_USAGE_PERIOD 10 // computer memory usage and check for exclusive apps this often -#define FRACTION_DONE_POLL_PERIOD 3600 +#define STUCK_CHECK_POLL_PERIOD 3600 // poll if fraction done has change within an hour to see if it is stuck //////// WORK FETCH diff --git a/lib/error_numbers.h b/lib/error_numbers.h index 5008b1e9924..d6110acc3e9 100644 --- a/lib/error_numbers.h +++ b/lib/error_numbers.h @@ -42,7 +42,6 @@ #define EXIT_INIT_FAILURE 206 #define EXIT_NO_SUB_TASKS 207 #define 
EXIT_SUB_TASK_FAILURE 208 -#define EXIT_TASK_STUCK 209 // Function return values. // NOTE: add new errors to the end of the list and don't change From cbd9b8ea53d101f846a1865f4511a9a3e5912e5b Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Sat, 9 Dec 2023 19:39:44 -0600 Subject: [PATCH 04/12] Fix trailing whitespace --- client/app_control.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/app_control.cpp b/client/app_control.cpp index 137df48e195..365ecadcc4b 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -166,7 +166,7 @@ bool ACTIVE_TASK_SET::poll() { atp->stuck_check_cpu_time = atp->current_cpu_time; } // if fraction done does not change and cpu time is <10, message the user - if (atp->stuck_check_fraction_done == atp->fraction_done && + if (atp->stuck_check_fraction_done == atp->fraction_done && (atp->current_cpu_time - atp->stuck_check_cpu_time) < 10) { msg_printf(atp->result->project, MSG_INFO, "Task has not made progress in last hour, consider aborting"); } From cec23a1bab6e2c3325d3286ec8182e9e44096735 Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Sat, 9 Dec 2023 20:34:46 -0600 Subject: [PATCH 05/12] Update MSG Label --- client/app_control.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/client/app_control.cpp b/client/app_control.cpp index 365ecadcc4b..9202d37cb9a 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -168,7 +168,8 @@ bool ACTIVE_TASK_SET::poll() { // if fraction done does not change and cpu time is <10, message the user if (atp->stuck_check_fraction_done == atp->fraction_done && (atp->current_cpu_time - atp->stuck_check_cpu_time) < 10) { - msg_printf(atp->result->project, MSG_INFO, "Task has not made progress in last hour, consider aborting"); + msg_printf(atp->result->project, MSG_USER_ALERT, + "Task has not made progress in last hour, consider aborting"); } if (atp->elapsed_time < atp->stuck_check_elapsed_time + STUCK_CHECK_POLL_PERIOD) continue; 
atp->stuck_check_elapsed_time = atp->elapsed_time; From 7483a350407e8e5e8333290269e18db7c1dd9d95 Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Sat, 9 Dec 2023 20:39:22 -0600 Subject: [PATCH 06/12] Update comments --- client/client_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/client_state.h b/client/client_state.h index 9876bf1b03f..61513628415 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -602,7 +602,7 @@ extern THREAD throttle_thread; // computer memory usage and check for exclusive apps this often #define STUCK_CHECK_POLL_PERIOD 3600 - // poll if fraction done has change within an hour to see if it is stuck + // poll if a job is ever stuck //////// WORK FETCH From c159e89cdcb1b821bad6e27cf63e4c948fdeb924 Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Sat, 9 Dec 2023 22:04:29 -0600 Subject: [PATCH 07/12] Fix typo --- client/app_control.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/app_control.cpp b/client/app_control.cpp index 9202d37cb9a..385a57c294c 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -157,7 +157,7 @@ bool ACTIVE_TASK_SET::poll() { if (gstate.now - last_stuck_check_time > STUCK_CHECK_POLL_PERIOD) { for (i=0; inon_cpu_intensive()) continue; + if (atp->non_cpu_intensive()) continue; if (atp->sporadic()) continue; if (atp->stuck_check_elapsed_time == 0) { // first pass From f68fa1ed9b657fca5f3dc8eb749dda805bb08495 Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Sun, 10 Dec 2023 21:35:44 -0600 Subject: [PATCH 08/12] Update order in stuck check poll --- client/app_control.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/client/app_control.cpp b/client/app_control.cpp index 385a57c294c..7face9114d2 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -164,14 +164,15 @@ bool ACTIVE_TASK_SET::poll() { atp->stuck_check_elapsed_time = atp->elapsed_time; atp->stuck_check_fraction_done = atp->fraction_done; 
atp->stuck_check_cpu_time = atp->current_cpu_time; + continue; } - // if fraction done does not change and cpu time is <10, message the user + if (atp->elapsed_time < atp->stuck_check_elapsed_time + STUCK_CHECK_POLL_PERIOD) continue; if (atp->stuck_check_fraction_done == atp->fraction_done && (atp->current_cpu_time - atp->stuck_check_cpu_time) < 10) { + // if fraction done does not change and cpu time is <10, message the user msg_printf(atp->result->project, MSG_USER_ALERT, "Task has not made progress in last hour, consider aborting"); } - if (atp->elapsed_time < atp->stuck_check_elapsed_time + STUCK_CHECK_POLL_PERIOD) continue; atp->stuck_check_elapsed_time = atp->elapsed_time; atp->stuck_check_fraction_done = atp->fraction_done; atp->stuck_check_cpu_time = atp->current_cpu_time; From b657da49d78a4793591bb1f14c3253569341848a Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Sun, 10 Dec 2023 21:50:58 -0600 Subject: [PATCH 09/12] Move some code for better readability --- client/app_control.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/app_control.cpp b/client/app_control.cpp index 7face9114d2..e5672dfb960 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -155,6 +155,7 @@ bool ACTIVE_TASK_SET::poll() { // abort after some time static double last_stuck_check_time = 0; if (gstate.now - last_stuck_check_time > STUCK_CHECK_POLL_PERIOD) { + last_stuck_check_time = gstate.now; for (i=0; inon_cpu_intensive()) continue; @@ -177,7 +178,6 @@ bool ACTIVE_TASK_SET::poll() { atp->stuck_check_fraction_done = atp->fraction_done; atp->stuck_check_cpu_time = atp->current_cpu_time; } - last_stuck_check_time = gstate.now; } if (action) { From 7d31bf5e788db7e7f38905d7d467b976a7c4c147 Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Mon, 11 Dec 2023 19:05:53 -0600 Subject: [PATCH 10/12] Add the result name into print --- client/app_control.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/app_control.cpp
b/client/app_control.cpp index e5672dfb960..85930346a74 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -172,7 +172,7 @@ bool ACTIVE_TASK_SET::poll() { (atp->current_cpu_time - atp->stuck_check_cpu_time) < 10) { // if fraction done does not change and cpu time is <10, message the user msg_printf(atp->result->project, MSG_USER_ALERT, - "Task has not made progress in last hour, consider aborting"); + "Task has not made progress in last hour, consider aborting %s", atp->result->name); } atp->stuck_check_elapsed_time = atp->elapsed_time; atp->stuck_check_fraction_done = atp->fraction_done; From 6948c6f6a4446cde21ac49be665b7afabbac0cb9 Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Mon, 11 Dec 2023 19:16:44 -0600 Subject: [PATCH 11/12] Update code to fit original code style --- client/app_control.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/client/app_control.cpp b/client/app_control.cpp index 85930346a74..02a83ebe3a1 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -172,7 +172,9 @@ bool ACTIVE_TASK_SET::poll() { (atp->current_cpu_time - atp->stuck_check_cpu_time) < 10) { // if fraction done does not change and cpu time is <10, message the user msg_printf(atp->result->project, MSG_USER_ALERT, - "Task has not made progress in last hour, consider aborting %s", atp->result->name); + "[task] has not made progress in last hour, consider aborting task %s", + atp->result->name + ); } atp->stuck_check_elapsed_time = atp->elapsed_time; atp->stuck_check_fraction_done = atp->fraction_done; From eb07ea0e41f5ae27108dfce68d8a1ceae33fe405 Mon Sep 17 00:00:00 2001 From: Franke Tang Date: Mon, 11 Dec 2023 19:18:42 -0600 Subject: [PATCH 12/12] Fix trailing whitespace --- client/app_control.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/app_control.cpp b/client/app_control.cpp index 02a83ebe3a1..10e4fc18ed8 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -172,7 +172,7 @@ 
bool ACTIVE_TASK_SET::poll() { (atp->current_cpu_time - atp->stuck_check_cpu_time) < 10) { // if fraction done does not change and cpu time is <10, message the user msg_printf(atp->result->project, MSG_USER_ALERT, - "[task] has not made progress in last hour, consider aborting task %s", + "[task] has not made progress in last hour, consider aborting task %s", atp->result->name ); }