Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add check for stuck jobs in poll() #5451

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions client/app.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ ACTIVE_TASK::ACTIVE_TASK() {
fraction_done_elapsed_time = 0;
first_fraction_done = 0;
first_fraction_done_elapsed_time = 0;
stuck_check_fraction_done = 0;
stuck_check_elapsed_time = 0;
stuck_check_cpu_time = 0;
scheduler_state = CPU_SCHED_UNINITIALIZED;
next_scheduler_state = CPU_SCHED_UNINITIALIZED;
signal = 0;
Expand Down
6 changes: 6 additions & 0 deletions client/app.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@ struct ACTIVE_TASK {
// first frac done reported during this run of task
double first_fraction_done_elapsed_time;
// elapsed time when the above was reported
double stuck_check_fraction_done;
// fraction done at last stuck check
double stuck_check_elapsed_time;
// elapsed time at last stuck check
double stuck_check_cpu_time;
// cpu time at last check
SCHEDULER_STATE scheduler_state;
SCHEDULER_STATE next_scheduler_state; // temp
int signal;
Expand Down
30 changes: 30 additions & 0 deletions client/app_control.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,36 @@ bool ACTIVE_TASK_SET::poll() {
}
}
}

// Check whether a job is "stuck": over the last STUCK_CHECK_POLL_PERIOD
// of elapsed time (the alert text calls this "the last hour")
// its fraction done did not change and its CPU time advanced by less than 10.
// If so, notify the user about the issue.
// TODO: abort after some time (only the notification is implemented here).
//
static double last_stuck_check_time = 0;
if (gstate.now - last_stuck_check_time > STUCK_CHECK_POLL_PERIOD) {
    for (i=0; i<active_tasks.size(); i++) {
        ACTIVE_TASK* atp = active_tasks[i];

        // Only ordinary jobs are checked; non-CPU-intensive and sporadic
        // jobs are skipped (presumably because low CPU use is normal for them).
        // The original test was inverted: it skipped every ordinary job
        // and checked only the non-CPU-intensive ones.
        if (atp->non_cpu_intensive()) continue;
        if (atp->sporadic()) continue;

        if (atp->stuck_check_elapsed_time == 0) {
            // first pass: record the baseline and wait for the next check.
            // Comparing against values we have just copied would always
            // look like "no progress" and raise a false alert.
            atp->stuck_check_elapsed_time = atp->elapsed_time;
            atp->stuck_check_fraction_done = atp->fraction_done;
            atp->stuck_check_cpu_time = atp->current_cpu_time;
            continue;
        }

        // Don't judge the job until a full period of elapsed time
        // has accumulated since the baseline was taken.
        if (atp->elapsed_time < atp->stuck_check_elapsed_time + STUCK_CHECK_POLL_PERIOD) continue;

        // if fraction done has not changed and CPU time advanced by <10
        // since the baseline, message the user
        if (atp->stuck_check_fraction_done == atp->fraction_done &&
            (atp->current_cpu_time - atp->stuck_check_cpu_time) < 10) {
            msg_printf(atp->result->project, MSG_USER_ALERT,
                "Task has not made progress in last hour, consider aborting");
        }

        // start a new measurement window
        atp->stuck_check_elapsed_time = atp->elapsed_time;
        atp->stuck_check_fraction_done = atp->fraction_done;
        atp->stuck_check_cpu_time = atp->current_cpu_time;
    }
    last_stuck_check_time = gstate.now;
}

if (action) {
gstate.set_client_state_dirty("ACTIVE_TASK_SET::poll");
}
Expand Down
3 changes: 3 additions & 0 deletions client/client_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,9 @@ extern THREAD throttle_thread;
#define MEMORY_USAGE_PERIOD 10
// compute memory usage and check for exclusive apps this often

#define STUCK_CHECK_POLL_PERIOD 3600
// how often ACTIVE_TASK_SET::poll() checks for stuck jobs;
// also the span of elapsed time over which a job's progress is compared
// (3600 matches the "last hour" in the user alert, so presumably seconds)

//////// WORK FETCH

#define WORK_FETCH_PERIOD 60
Expand Down
Loading