From 8ffd9ca906916303a2596ccf0cab1a9743e8ee95 Mon Sep 17 00:00:00 2001 From: James Corbett Date: Fri, 24 Jan 2025 16:02:38 -0800 Subject: [PATCH 1/2] pals: make eventlog timeout configurable Problem: sometimes 10 seconds is not enough of a timeout for reading from the eventlog if lustre is hanging. Make the timeout configurable. --- src/shell/plugins/cray_pals.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/shell/plugins/cray_pals.c b/src/shell/plugins/cray_pals.c index c716492..7af964e 100644 --- a/src/shell/plugins/cray_pals.c +++ b/src/shell/plugins/cray_pals.c @@ -490,7 +490,8 @@ static int create_apinfo (const char *apinfo_path, flux_shell_t *shell) static int read_future (flux_future_t *fut, char *buf, size_t bufsize, - json_int_t *random) + json_int_t *random, + double timeout) { json_t *o = NULL; json_t *context = NULL; @@ -501,7 +502,7 @@ static int read_future (flux_future_t *fut, json_int_t portnum; int bytes_written; - while (flux_future_wait_for (fut, 10.0) == 0 + while (flux_future_wait_for (fut, timeout) == 0 && flux_job_event_watch_get (fut, &event) == 0) { if (!(o = eventlog_entry_decode (event))) { shell_log_errno ("Error decoding eventlog entry"); @@ -576,13 +577,19 @@ static int get_pals_ports (flux_shell_t *shell, json_int_t jobid) flux_future_t *fut = NULL; int rc; json_int_t random; + double timeout = 10.0; if (!(h = flux_shell_get_flux (shell)) || !(fut = flux_job_event_watch (h, (flux_jobid_t)jobid, "eventlog", 0))) { shell_log_error ("Error creating event_watch future"); return -1; } - if ((rc = read_future (fut, buf, sizeof (buf), &random)) < 0) + if (flux_shell_getopt_unpack (shell, + "cray-pals", + "{s?F}", + "timeout", + &timeout) < 0 + || (rc = read_future (fut, buf, sizeof (buf), &random, timeout)) < 0) shell_log_error ("Error reading ports from eventlog"); flux_future_destroy (fut); @@ -802,7 +809,7 @@ int flux_plugin_init (flux_plugin_t *p) shell_debug ("enabled"); - // If -o 
cray-pals.no-edit-env is was speciifed set a flag for later + // If -o cray-pals.no-edit-env was specified, set a flag for later no_edit_env = 0; (void)flux_shell_getopt_unpack (shell, "cray-pals", From bffb6bfc1a31ee7ab53c5e82646390136822d34d Mon Sep 17 00:00:00 2001 From: James Corbett Date: Mon, 27 Jan 2025 15:27:40 -0800 Subject: [PATCH 2/2] cray-pals: improve eventlog error messages Problem: cray_pals.c provides no indication of whether it has received any events at all before it times out. Improve the error message to say what the last event received was, and whether any events have been received at all. --- src/shell/plugins/cray_pals.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/shell/plugins/cray_pals.c b/src/shell/plugins/cray_pals.c index 7af964e..d2aebe4 100644 --- a/src/shell/plugins/cray_pals.c +++ b/src/shell/plugins/cray_pals.c @@ -496,7 +496,7 @@ static int read_future (flux_future_t *fut, json_t *o = NULL; json_t *context = NULL; json_t *array; - const char *name, *event = NULL; + const char *name = "", *event = NULL; size_t index = 0; json_t *value; json_int_t portnum; @@ -566,7 +566,8 @@ static int read_future (flux_future_t *fut, json_decref (o); } } - shell_log_error ("Timed out waiting for start event"); + shell_log_error ("Timed out waiting for start event, last event received was %s", + name); return -1; }