From 3e43a10939fdd52f6d66db7cff8244f47c381296 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Tue, 20 Sep 2022 12:16:01 -0700 Subject: [PATCH 1/3] Upgrade Slurm to 22.05.3 In addition to the upgrade: Remove TaskAffinity=no in cgroup.conf. This parameter has been ignored since Slurm 21. In addition, Slurm 22 fails if the parameter still exists. Signed-off-by: Hanwen --- CHANGELOG.md | 1 + attributes/default.rb | 4 ++-- .../templates/default/slurm/cgroup.conf.erb | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8864ec509e..13b3a51b3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Reduce timeout from 50 to a maximum of 5min in case of DynamoDB connection issues at compute node bootstrap. - Change the logic to number the routing tables when an instance have multiple NICs. - Upgrade Python from 3.7.13 to 3.9.13. +- Upgrade Slurm to version 22.05.3. 
3.2.0 ------ diff --git a/attributes/default.rb b/attributes/default.rb index c5335dc896..14696920a9 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -124,9 +124,9 @@ # URLs to software packages used during install recipes # Slurm software default['cluster']['slurm_plugin_dir'] = '/etc/parallelcluster/slurm_plugin' -default['cluster']['slurm']['version'] = '21-08-8-2' +default['cluster']['slurm']['version'] = '22-05-3-1' default['cluster']['slurm']['url'] = "https://github.com/SchedMD/slurm/archive/slurm-#{node['cluster']['slurm']['version']}.tar.gz" -default['cluster']['slurm']['sha1'] = 'f7687c11f024fbbe5399b93906d1179adc5c3fb6' +default['cluster']['slurm']['sha1'] = 'f7340a7def5ba359327dd8ff41272b76e28d8bdf' default['cluster']['slurm']['user'] = 'slurm' default['cluster']['slurm']['user_id'] = node['cluster']['reserved_base_uid'] + 1 default['cluster']['slurm']['group'] = node['cluster']['slurm']['user'] diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/cgroup.conf.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/cgroup.conf.erb index f9c3800a72..d5ac71e626 100644 --- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/cgroup.conf.erb +++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/cgroup.conf.erb @@ -2,7 +2,6 @@ # Slurm cgroup support configuration file ### CgroupAutomount=yes -TaskAffinity=no ConstrainCores=yes # # WARNING!!! The slurm_parallelcluster_cgroup.conf file included below can be updated by the pcluster process. From 73ecd7a555b938fdaf40dd8db206413112f0afcc Mon Sep 17 00:00:00 2001 From: Hanwen Date: Tue, 20 Sep 2022 12:17:19 -0700 Subject: [PATCH 2/3] Remove prolog/epilog used as a workaround for cluster without Internet connection. The Slurm bug has been fixed with Slurm 22.05. 
The workaround is no longer necessary Signed-off-by: Hanwen --- attributes/default.rb | 1 - .../cloudwatch_log_files.json | 22 -------- .../files/default/head_node_slurm/epilog | 39 -------------- .../files/default/head_node_slurm/prolog | 54 ------------------- .../recipes/config_head_node.rb | 28 ---------- .../templates/default/slurm/slurm.conf.erb | 13 ----- 6 files changed, 157 deletions(-) delete mode 100644 cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/epilog delete mode 100644 cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/prolog diff --git a/attributes/default.rb b/attributes/default.rb index 14696920a9..49619289c2 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -552,7 +552,6 @@ default['cluster']['raid_vol_ids'] = '' default['cluster']['dns_domain'] = nil default['cluster']['use_private_hostname'] = 'false' -default['cluster']['add_node_hostnames_in_hosts_file'] = node['cluster']['use_private_hostname'] default['cluster']['skip_install_recipes'] = 'yes' default['cluster']['enable_nss_slurm'] = node['cluster']['directory_service']['enabled'] default['cluster']['realmemory_to_ec2memory_ratio'] = 0.95 diff --git a/cookbooks/aws-parallelcluster-config/files/default/cloudwatch_agent/cloudwatch_log_files.json b/cookbooks/aws-parallelcluster-config/files/default/cloudwatch_agent/cloudwatch_log_files.json index 4fb2354ca6..710c6f4277 100644 --- a/cookbooks/aws-parallelcluster-config/files/default/cloudwatch_agent/cloudwatch_log_files.json +++ b/cookbooks/aws-parallelcluster-config/files/default/cloudwatch_agent/cloudwatch_log_files.json @@ -540,28 +540,6 @@ ], "feature_conditions": [] }, - { - "timestamp_format_key": "default", - "file_path": "/var/log/parallelcluster/slurm_prolog_epilog.log", - "log_stream_name": "slurm_prolog_epilog", - "schedulers": [ - "slurm" - ], - "platforms": [ - "centos", - "ubuntu", - "amazon" - ], - "node_roles": [ - "ComputeFleet" - ], - "feature_conditions": [ - { - "dna_key": 
"use_private_hostname", - "satisfying_values": ["true"] - } - ] - }, { "timestamp_format_key": "default", "file_path": "/var/log/parallelcluster/clusterstatusmgtd", diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/epilog b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/epilog deleted file mode 100644 index 50998e0448..0000000000 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/epilog +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# -# Cookbook Name:: aws-parallelcluster -# -# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. -# -# This script removes content with the following format to /etc/hosts file -# #HOSTS_JOB_ -# 192.168.1.2 queue-0-st-compute-resource-0-1 ip-192-168-1-2 -# 192.168.1.10 queue-0-st-compute-resource-0-2 ip-192-168-1-10 -# #END_JOB_ -# -# SLURM_JOB_ID is a env var provide by Slurm - -LOG_FILE_PATH="/var/log/parallelcluster/slurm_prolog_epilog.log" - -_log() { - text=$1 - level="${2:-INFO}" # If the second argument is not provided, "INFO" is the default log level - log_time=$(date "+%Y-%m-%d %H:%M:%S") - echo "${log_time} - ${level} - Job ${SLURM_JOB_ID} - ${text}" >> "${LOG_FILE_PATH}" -} - -_log "Removing nodes from /etc/hosts" -if ! 
sed_output=$(sed -i "/#HOSTS_JOB_${SLURM_JOB_ID}/,/#END_JOB_${SLURM_JOB_ID}/d" /etc/hosts 2>&1); then - _log "Failed to remove nodes: ${sed_output}" "ERROR" - exit 1 -fi -_log "Finished removing nodes from /etc/hosts" -exit 0 \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/prolog b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/prolog deleted file mode 100644 index 86f8c6f0c5..0000000000 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/prolog +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# -# Cookbook Name:: aws-parallelcluster -# -# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. -# -# This script writes content with the following format to /etc/hosts file -# #HOSTS_JOB_ -# 192.168.1.2 queue-0-st-compute-resource-0-1 ip-192-168-1-2 -# 192.168.1.10 queue-0-st-compute-resource-0-2 ip-192-168-1-10 -# #END_JOB_ -# This content contains the DNS information for all nodes the job that is allocated to. -# The nodes information of newer job is always inserted before nodes information of older job -# to ensure the latest DNS information is used. 
-# -# SlURM_NODE_ALIASES and SLURM_JOB_ID are an env vars provided by Slurm, - -LOG_FILE_PATH="/var/log/parallelcluster/slurm_prolog_epilog.log" - -_log() { - text=$1 - level="${2:-INFO}" # If the second argument is not provided, "INFO" is the default log level - log_time=$(date "+%Y-%m-%d %H:%M:%S") - echo "${log_time} - ${level} - Job ${SLURM_JOB_ID} - ${text}" >> "${LOG_FILE_PATH}" -} - -_log "Adding nodes to /etc/hosts" -# SLURM_NODE_ALIASES has value like "queue-0-dy-compute-resource-0-1:[192.168.1.2]:ip-192-168-1-2" -# The following line transforms this line to "192.168.1.2 queue-0-dy-compute-resource ip-192-168-1-2" -hosts=$(echo -n "${SLURM_NODE_ALIASES}" | awk 'BEGIN{RS=","; FS=":";ORS="\\n"}; {gsub(/\[|\]/,"",$2); print $2,$1,$3}' ) -lines='#HOSTS_JOB_'"${SLURM_JOB_ID}\n${hosts}"'#END_JOB_'"${SLURM_JOB_ID}" -if grep -q '^#HOSTS_JOB_.*' /etc/hosts -then - # If there is other nodes information in the file, the newest nodes information is inserted before the older ones. - if ! sed_output=$(sed -i '0,/^#HOSTS_JOB_.*/s//'"${lines}\n&/" /etc/hosts 2>&1); then - # If the sed command errored, log the stdout and stderr. Note that when executing the command, the stderr is redirected to stdout - _log "Failed to add nodes: ${sed_output}" "ERROR" - exit 1 - fi -else - # If there is no other nodes information in the file, the nodes information is appended to the file. 
- echo -e "${lines}" >> /etc/hosts -fi -_log "Finished adding nodes to /etc/hosts" -exit 0 \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config_head_node.rb index 131a5922a7..fba0c106ee 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config_head_node.rb @@ -192,34 +192,6 @@ action :create end -if node['cluster']['add_node_hostnames_in_hosts_file'] == "true" - directory "#{node['cluster']['slurm']['install_dir']}/etc/pcluster/prolog.d" do - user 'root' - group 'root' - mode '0755' - end - - cookbook_file "#{node['cluster']['slurm']['install_dir']}/etc/pcluster/prolog.d/01-pcluster-prolog" do - source 'head_node_slurm/prolog' - owner node['cluster']['slurm']['user'] - group node['cluster']['slurm']['group'] - mode '0744' - end - - directory "#{node['cluster']['slurm']['install_dir']}/etc/pcluster/epilog.d" do - user 'root' - group 'root' - mode '0755' - end - - cookbook_file "#{node['cluster']['slurm']['install_dir']}/etc/pcluster/epilog.d/01-pcluster-epilog" do - source 'head_node_slurm/epilog' - owner node['cluster']['slurm']['user'] - group node['cluster']['slurm']['group'] - mode '0744' - end -end - service "slurmctld" do supports restart: false action %i(enable start) diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/slurm.conf.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/slurm.conf.erb index 75955a5c4e..b71ebbff4f 100644 --- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/slurm.conf.erb +++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/slurm.conf.erb @@ -40,19 +40,6 @@ SuspendTimeout=120 PrivateData=cloud ResumeRate=0 SuspendRate=0 -<% if node["cluster"]["add_node_hostnames_in_hosts_file"] == 'true' -%> -# -# PROLOG AND EPILOG -# prolog is executed to add nodes info to /etc/hosts on compute nodes when each 
job is allocated -# epilog is executed to clean contents written by prolog -# PrologFlags specifies the prolog is executed at job allocation and prologs and epilogs are of different jobs are executed serially -# SchedulerParameters allows jobs to be requeued to other nodes if prolog error exits. -# Note the error exit of prolog drains a node, because the error of prolog is considered as a node error. -Epilog=<%= node['cluster']['slurm']['install_dir'] %>/etc/pcluster/epilog.d/* -Prolog=<%= node['cluster']['slurm']['install_dir'] %>/etc/pcluster/prolog.d/* -PrologFlags=alloc,serial -SchedulerParameters=nohold_on_prolog_fail -<% end -%> # # TIMERS SlurmctldTimeout=300 From ec90185632c114b83e197ed862576b083e2759ec Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 21 Sep 2022 14:59:18 +0000 Subject: [PATCH 3/3] Update rubocop requirement from ~> 1.15.0 to ~> 1.25.0 Updates the requirements on [rubocop](https://github.com/rubocop/rubocop) to permit the latest version. - [Release notes](https://github.com/rubocop/rubocop/releases) - [Changelog](https://github.com/rubocop/rubocop/blob/master/CHANGELOG.md) - [Commits](https://github.com/rubocop/rubocop/compare/v1.15.0...v1.25.0) --- updated-dependencies: - dependency-name: rubocop dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- Gemfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index 506378c8bb..b84d0b633f 100644 --- a/Gemfile +++ b/Gemfile @@ -7,7 +7,7 @@ gem 'berkshelf' group :style do gem 'cookstyle', '~> 7.25.9' gem 'rake', '~> 13.0.1' - gem 'rubocop', '~> 1.22.3' + gem 'rubocop', '~> 1.23.0' gem 'rubocop-gitlab-security', '~> 0.1.1' end