|
21 | 21 | from botocore.config import Config |
22 | 22 | from common.schedulers.slurm_commands import get_nodes_info |
23 | 23 | from common.time_utils import seconds |
24 | | -from common.utils import get_metadata, sleep_remaining_loop_time |
| 24 | +from common.utils import get_metadata, run_command, sleep_remaining_loop_time |
25 | 25 | from retrying import retry |
26 | | -from slurm_plugin.common import CONFIG_FILE_DIR, InstanceManager, is_clustermgtd_heartbeat_valid, log_exception |
| 26 | +from slurm_plugin.common import ( |
| 27 | + CONFIG_FILE_DIR, |
| 28 | + DEFAULT_COMMAND_TIMEOUT, |
| 29 | + InstanceManager, |
| 30 | + expired_clustermgtd_heartbeat, |
| 31 | + get_clustermgtd_heartbeat, |
| 32 | + log_exception, |
| 33 | +) |
27 | 34 |
|
28 | 35 | LOOP_TIME = 60 |
29 | 36 | RELOAD_CONFIG_ITERATIONS = 10 |
@@ -53,14 +60,20 @@ def __repr__(self): |
53 | 60 | attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) |
54 | 61 | return "{class_name}({attrs})".format(class_name=self.__class__.__name__, attrs=attrs) |
55 | 62 |
|
56 | | - @log_exception(log, "reading computemgtd config", catch_exception=IOError, raise_on_error=True) |
| 63 | + @log_exception(log, "reading computemgtd config", catch_exception=Exception, raise_on_error=True) |
57 | 64 | def _get_config(self, config_file_path): |
58 | 65 | """Get computemgtd configuration.""" |
59 | 66 | log.info("Reading %s", config_file_path) |
60 | 67 | config = ConfigParser() |
61 | 68 | try: |
62 | | - config.read_file(open(config_file_path, "r")) |
63 | | - except IOError: |
| 69 | + # Use subprocess based method to copy shared file to local to prevent hanging when NFS is down |
| 70 | + run_command( |
| 71 | + f"cat {config_file_path} > {CONFIG_FILE_DIR}/.computemgtd_config.local", |
| 72 | + timeout=DEFAULT_COMMAND_TIMEOUT, |
| 73 | + shell=True, # nosec |
| 74 | + ) |
| 75 | + config.read_file(open(f"{CONFIG_FILE_DIR}/.computemgtd_config.local", "r")) |
| 76 | + except Exception: |
64 | 77 | log.error(f"Cannot read computemgtd configuration file: {config_file_path}") |
65 | 78 | raise |
66 | 79 |
|
@@ -99,11 +112,10 @@ def _get_config(self, config_file_path): |
99 | 112 | def _read_nodename_from_file(nodename_file_path): |
100 | 113 | """Read self nodename from a file.""" |
101 | 114 | try: |
102 | | - log.info("Reading self nodename from %s", nodename_file_path) |
103 | 115 | with open(nodename_file_path, "r") as nodename_file: |
104 | 116 | nodename = nodename_file.read() |
105 | 117 | return nodename |
106 | | - except IOError as e: |
| 118 | + except Exception as e: |
107 | 119 | log.error("Unable to read self nodename from %s with exception: %s\n", nodename_file_path, e) |
108 | 120 | raise |
109 | 121 |
|
@@ -187,9 +199,16 @@ def _run_computemgtd(): |
187 | 199 | reload_config_counter -= 1 |
188 | 200 |
|
189 | 201 | # Check heartbeat |
190 | | - if not is_clustermgtd_heartbeat_valid( |
191 | | - current_time, computemgtd_config.clustermgtd_timeout, computemgtd_config.clustermgtd_heartbeat_file_path |
192 | | - ): |
| 202 | + try: |
| 203 | + last_heartbeat = get_clustermgtd_heartbeat(computemgtd_config.clustermgtd_heartbeat_file_path) |
| 204 | + log.info("Latest heartbeat from clustermgtd: %s", last_heartbeat) |
| 205 | + except Exception as e: |
| 206 | + log.warning( |
| 207 | + "Unable to retrieve clustermgtd heartbeat. Using last known heartbeat: %s with exception: %s", |
| 208 | + last_heartbeat, |
| 209 | + e, |
| 210 | + ) |
| 211 | + if expired_clustermgtd_heartbeat(last_heartbeat, current_time, computemgtd_config.clustermgtd_timeout): |
193 | 212 | if computemgtd_config.disable_computemgtd_actions: |
194 | 213 | log.info("All computemgtd actions currently disabled") |
195 | 214 | elif _is_self_node_down(computemgtd_config.nodename): |
|
0 commit comments