Use subprocess based method to read shared files

Rex · rexcsn · commit 22c7e21033d8 · 2021-06-29T15:14:46.000-07:00
* Avoid reading NFS shared files with `open` directly, because when NFS is not available (e.g. the head node is down) the `open` call will hang forever.
* Use `subprocess.run`, which has built-in timeout functionality, to copy the shared file to a local path first, or to `cat` the file directly
* This will prevent computemgtd from hanging even if NFS directories are not available

Signed-off-by: Rex <shuningc@amazon.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@ This file is used to list changes made in each version of the aws-parallelcluste
 - SGE: always use shortname as hostname filter with `qstat`. This will make nodewatcher more robust when using custom DHCP option, where the full hostname seen by `SGE` might differ from the hostname returned from EC2 metadata(local-hostname).
 - Transition from IMDSv1 to IMDSv2.
 - Have `computemgtd` reuse last available daemon configuration when the new one cannot be loaded.
+- Use methods with timeouts to read NFS shared files, which will prevent `computemgtd` from hanging when NFS filesystems are not available.
 
 **BUG FIXES**
 - Fix a bug that caused `clustermgtd` to not immediately replace instances with failed status check that are in replacement process.
diff --git a/src/slurm_plugin/common.py b/src/slurm_plugin/common.py
@@ -21,7 +21,7 @@
 from botocore.config import Config
 from botocore.exceptions import ClientError
 from common.schedulers.slurm_commands import InvalidNodenameError, parse_nodename, update_nodes
-from common.utils import grouper
+from common.utils import check_command_output, grouper
 
 CONFIG_FILE_DIR = "/etc/parallelcluster/slurm_plugin"
 EC2Instance = collections.namedtuple("EC2Instance", ["id", "private_ip", "hostname", "launch_time"])
@@ -49,6 +49,7 @@
 # timestamp used by clustermgtd and computemgtd should be in default ISO format
 # YYYY-MM-DDTHH:MM:SS.ffffff+HH:MM[:SS[.ffffff]]
 TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z"
+DEFAULT_COMMAND_TIMEOUT = 30
 
 
 logger = logging.getLogger(__name__)
@@ -430,16 +431,27 @@ def retrieve_instance_type_mapping(file_path):
         raise
 
 
-def _get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path):
+def get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path):
     """Get clustermgtd's last heartbeat."""
-    with open(clustermgtd_heartbeat_file_path, "r") as timestamp_file:
-        # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str
-        # datetime.strptime will not work with str(datetime)
-        # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00
-        return datetime.strptime(timestamp_file.read().strip(), TIMESTAMP_FORMAT)
+    # Use subprocess based method to read shared file to prevent hanging when NFS is down
+    # Do not copy to local. Different users need to access the file, but file should be writable by root only
+    # Only use last line of output to avoid taking unexpected output in stdout
+    heartbeat = (
+        check_command_output(
+            f"cat {clustermgtd_heartbeat_file_path}",
+            timeout=DEFAULT_COMMAND_TIMEOUT,
+            shell=True,  # nosec
+        )
+        .splitlines()[-1]
+        .strip()
+    )
+    # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str
+    # datetime.strptime will not work with str(datetime)
+    # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00
+    return datetime.strptime(heartbeat, TIMESTAMP_FORMAT)
 
 
-def _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
+def expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
     """Test if clustermgtd heartbeat is expired."""
     if time_is_up(last_heartbeat, current_time, clustermgtd_timeout):
         logger.error(
@@ -454,9 +466,9 @@ def _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_tim
 
 def is_clustermgtd_heartbeat_valid(current_time, clustermgtd_timeout, clustermgtd_heartbeat_file_path):
     try:
-        last_heartbeat = _get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path)
+        last_heartbeat = get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path)
         logger.info("Latest heartbeat from clustermgtd: %s", last_heartbeat)
-        return not _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout)
+        return not expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout)
     except Exception as e:
         logger.error("Unable to retrieve clustermgtd heartbeat with exception: %s", e)
         return False
diff --git a/src/slurm_plugin/computemgtd.py b/src/slurm_plugin/computemgtd.py
@@ -21,9 +21,16 @@
 from botocore.config import Config
 from common.schedulers.slurm_commands import get_nodes_info
 from common.time_utils import seconds
-from common.utils import get_metadata, sleep_remaining_loop_time
+from common.utils import get_metadata, run_command, sleep_remaining_loop_time
 from retrying import retry
-from slurm_plugin.common import CONFIG_FILE_DIR, InstanceManager, is_clustermgtd_heartbeat_valid, log_exception
+from slurm_plugin.common import (
+    CONFIG_FILE_DIR,
+    DEFAULT_COMMAND_TIMEOUT,
+    InstanceManager,
+    expired_clustermgtd_heartbeat,
+    get_clustermgtd_heartbeat,
+    log_exception,
+)
 
 LOOP_TIME = 60
 RELOAD_CONFIG_ITERATIONS = 10
@@ -53,14 +60,20 @@ def __repr__(self):
         attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()])
         return "{class_name}({attrs})".format(class_name=self.__class__.__name__, attrs=attrs)
 
-    @log_exception(log, "reading computemgtd config", catch_exception=IOError, raise_on_error=True)
+    @log_exception(log, "reading computemgtd config", catch_exception=Exception, raise_on_error=True)
     def _get_config(self, config_file_path):
         """Get computemgtd configuration."""
         log.info("Reading %s", config_file_path)
         config = ConfigParser()
         try:
-            config.read_file(open(config_file_path, "r"))
-        except IOError:
+            # Use subprocess based method to copy shared file to local to prevent hanging when NFS is down
+            run_command(
+                f"cat {config_file_path} > {CONFIG_FILE_DIR}/.computemgtd_config.local",
+                timeout=DEFAULT_COMMAND_TIMEOUT,
+                shell=True,  # nosec
+            )
+            config.read_file(open(f"{CONFIG_FILE_DIR}/.computemgtd_config.local", "r"))
+        except Exception:
             log.error(f"Cannot read computemgtd configuration file: {config_file_path}")
             raise
 
@@ -99,11 +112,10 @@ def _get_config(self, config_file_path):
     def _read_nodename_from_file(nodename_file_path):
         """Read self nodename from a file."""
         try:
-            log.info("Reading self nodename from %s", nodename_file_path)
             with open(nodename_file_path, "r") as nodename_file:
                 nodename = nodename_file.read()
             return nodename
-        except IOError as e:
+        except Exception as e:
             log.error("Unable to read self nodename from %s with exception: %s\n", nodename_file_path, e)
             raise
 
@@ -187,9 +199,16 @@ def _run_computemgtd():
             reload_config_counter -= 1
 
         # Check heartbeat
-        if not is_clustermgtd_heartbeat_valid(
-            current_time, computemgtd_config.clustermgtd_timeout, computemgtd_config.clustermgtd_heartbeat_file_path
-        ):
+        try:
+            last_heartbeat = get_clustermgtd_heartbeat(computemgtd_config.clustermgtd_heartbeat_file_path)
+            log.info("Latest heartbeat from clustermgtd: %s", last_heartbeat)
+        except Exception as e:
+            log.warning(
+                "Unable to retrieve clustermgtd heartbeat. Using last known heartbeat: %s with exception: %s",
+                last_heartbeat,
+                e,
+            )
+        if expired_clustermgtd_heartbeat(last_heartbeat, current_time, computemgtd_config.clustermgtd_timeout):
             if computemgtd_config.disable_computemgtd_actions:
                 log.info("All computemgtd actions currently disabled")
             elif _is_self_node_down(computemgtd_config.nodename):
diff --git a/tests/slurm_plugin/test_common.py b/tests/slurm_plugin/test_common.py
@@ -13,7 +13,7 @@
 import logging
 import os
 from datetime import datetime, timedelta, timezone
-from unittest.mock import call, mock_open
+from unittest.mock import call
 
 import botocore
 import pytest
@@ -26,7 +26,7 @@
     EC2Instance,
     EC2InstanceHealthState,
     InstanceManager,
-    _get_clustermgtd_heartbeat,
+    get_clustermgtd_heartbeat,
     time_is_up,
 )
 
@@ -1363,5 +1363,8 @@ def test_time_is_up(initial_time, current_time, grace_time, expected_result):
     ],
 )
 def test_get_clustermgtd_heartbeat(time, expected_parsed_time, mocker):
-    mocker.patch("slurm_plugin.common.open", mock_open(read_data=time.strftime(TIMESTAMP_FORMAT)))
-    assert_that(_get_clustermgtd_heartbeat("some file path")).is_equal_to(expected_parsed_time)
+    mocker.patch(
+        "slurm_plugin.common.check_command_output",
+        return_value=f"some_random_stdout\n{time.strftime(TIMESTAMP_FORMAT)}",
+    )
+    assert_that(get_clustermgtd_heartbeat("some file path")).is_equal_to(expected_parsed_time)
diff --git a/tests/slurm_plugin/test_computemgtd.py b/tests/slurm_plugin/test_computemgtd.py
@@ -61,7 +61,9 @@
 )
 def test_computemgtd_config(config_file, expected_attributes, test_datadir, mocker):
     mocker.patch("slurm_plugin.computemgtd.ComputemgtdConfig._read_nodename_from_file", return_value="some_nodename")
-    compute_config = ComputemgtdConfig(test_datadir / config_file)
+    mocker.patch("slurm_plugin.computemgtd.run_command")
+    mocker.patch("slurm_plugin.computemgtd.open", return_value=open(test_datadir / config_file, "r"))
+    compute_config = ComputemgtdConfig("mocked_config_path")
     for key in expected_attributes:
         assert_that(compute_config.__dict__.get(key)).is_equal_to(expected_attributes.get(key))