Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ ignore =
W503,
# N818: exception name should be named with an Error suffix
N818
# B042: Exception class with `__init__` should pass all args to `super().__init__()` in order to work with `copy.copy()`.
# Affected by false positive, https://github.com/PyCQA/flake8-bugbear/issues/525
B042
exclude =
.tox,
.git,
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG

This file is used to list changes made in each version of the aws-parallelcluster-node package.

3.15.0
------

**CHANGES**
- Direct users to the slurm_resume log to see EC2 error codes if no instances are launched.

3.14.0
------

Expand Down
3 changes: 2 additions & 1 deletion src/slurm_plugin/clustermgtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -1262,7 +1262,8 @@ def _reset_timeout_expired_compute_resources(
return
log.info(
"The following compute resources are in down state due to insufficient capacity: %s, "
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired",
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired. "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Test] Can we reflect this change in the corresponding unit test, the same way you did for the resume script?

"Check the slurm_resume log for EC2 error codes.",
self._insufficient_capacity_compute_resources,
self._config.insufficient_capacity_timeout,
)
Expand Down
6 changes: 5 additions & 1 deletion src/slurm_plugin/resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,11 @@ def _resume(arg_nodes, resume_config, slurm_resume):
print_with_count(failed_nodes),
)
for error_code, node_list in instance_manager.failed_nodes.items():
_handle_failed_nodes(node_list, reason=f"(Code:{error_code})Failure when resuming nodes")
_handle_failed_nodes(
node_list,
reason=f"(Code:{error_code})Failure when resuming nodes - "
f"Check the slurm_resume log for EC2 error codes",
)

event_publisher = ClusterEventPublisher.create_with_default_publisher(
event_logger,
Expand Down
6 changes: 5 additions & 1 deletion tests/slurm_plugin/test_resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,11 @@ def test_resume_launch(
if expected_failed_nodes:
for error_code, nodeset in expected_failed_nodes.items():
mock_handle_failed_nodes_calls.append(
call(nodeset, reason=f"(Code:{error_code})Failure when resuming nodes")
call(
nodeset,
reason=f"(Code:{error_code})Failure when resuming nodes - "
f"Check the slurm_resume log for EC2 error codes",
)
)
mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls)
mock_terminate_instances.assert_called_with(ANY, mock_resume_config.terminate_max_batch_size)
Expand Down