Skip to content

Commit 5a0428d

Browse files
hanwen-clusterhanwen-pcluste
authored and committed
Upgrade EFA to version 1.14.1
Starting from EFA 1.14.0, GDR support is enabled by default. Therefore, this commit also removes the logic to reinstall EFA if GDR is enabled. This commit is cherry-picked from 82154c9 and 9ebd58e. Signed-off-by: Hanwen <[email protected]>
1 parent 445e919 commit 5a0428d

File tree

6 files changed

+33
-27
lines changed

6 files changed

+33
-27
lines changed

CHANGELOG.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1717
- Do not strip `-` from compute resource name when configuring Slurm nodes.
1818
- Upgrade Slurm to version 21.08.4.
1919

20-
**BUG FIXES**
21-
- Fix issue that is preventing cluster names to start with `parallelcluster-` prefix.
2220

2321
3.0.2
2422
------
@@ -34,6 +32,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
3432
- Libfabric: ``libfabric-1.13.2``
3533
- Open MPI: ``openmpi40-aws-4.1.1-2``
3634

35+
**BUG FIXES**
36+
- Fix issue that is preventing cluster names to start with `parallelcluster-` prefix.
37+
3738
3.0.1
3839
------
3940

attributes/default.rb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,9 +187,8 @@
187187
)
188188

189189
# EFA
190-
default['cluster']['efa']['installer_version'] = '1.13.0'
190+
default['cluster']['efa']['installer_version'] = '1.14.1'
191191
default['cluster']['efa']['installer_url'] = "https://efa-installer.amazonaws.com/aws-efa-installer-#{node['cluster']['efa']['installer_version']}.tar.gz"
192-
default['cluster']['enable_efa_gdr'] = "no"
193192
default['cluster']['efa']['unsupported_aarch64_oses'] = %w(centos7)
194193

195194
# NICE DCV

cookbooks/aws-parallelcluster-config/recipes/efa.rb

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@
1515
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
# Installation recipe must be re-executed at runtime to enable GDR
19-
include_recipe "aws-parallelcluster-install::efa"
20-
2118
if platform?('ubuntu') && node['cluster']['enable_efa'] == 'compute' && node['cluster']['node_type'] == 'ComputeFleet'
2219
# Disabling ptrace protection is needed for EFA in order to use SHM transfer for intra-node communication.
2320
sysctl 'kernel.yama.ptrace_scope' do

cookbooks/aws-parallelcluster-install/recipes/efa.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
efa_installed = efa_installed?
2020

2121
if efa_installed && !::File.exist?(efa_tarball)
22-
Chef::Log.warn("Existing EFA version differs from the one shipped with ParallelCluster. Skipping ParallelCluster EFA installation and configuration. enable_gdr option will be ignored.")
22+
Chef::Log.warn("Existing EFA version differs from the one shipped with ParallelCluster. Skipping ParallelCluster EFA installation and configuration.")
2323
return
2424
end
2525

@@ -50,8 +50,6 @@
5050
installer_options = "-y"
5151
# skip efa-kmod installation on not supported platforms
5252
installer_options += " -k" unless node['conditions']['efa_supported']
53-
# enable gpudirect support
54-
installer_options += " -g" if efa_gdr_enabled?
5553

5654
bash "install efa" do
5755
cwd node['cluster']['sources_dir']
@@ -62,5 +60,5 @@
6260
./efa_installer.sh #{installer_options}
6361
rm -rf #{node['cluster']['sources_dir']}/aws-efa-installer
6462
EFAINSTALL
65-
not_if { efa_installed && !efa_gdr_enabled? }
63+
not_if { efa_installed }
6664
end

cookbooks/aws-parallelcluster-test/recipes/tests.rb

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -222,12 +222,34 @@ module load intelmpi && mpirun --help | grep '#{node['cluster']['intelmpi']['kit
222222
end
223223

224224
###################
225-
# EFA - GDR (GPUDirect RDMA)
225+
# EFA
226226
###################
227-
if node['conditions']['efa_supported'] && efa_gdr_enabled?
228-
execute 'check efa gdr installed' do
229-
command "modinfo efa | grep 'gdr:\ *Y'"
230-
user node['cluster']['cluster_user']
227+
if node['conditions']['efa_supported']
228+
if node['cluster']['os'].end_with?("-custom")
229+
# only check EFA is installed because when found in the base AMI we skip installation
230+
bash 'check efa installed' do
231+
cwd Chef::Config[:file_cache_path]
232+
code <<-EFA
233+
set -ex
234+
modinfo efa
235+
cat /opt/amazon/efa_installed_packages
236+
EFA
237+
end
238+
else
239+
# check EFA is installed and the version is expected
240+
bash 'check correct version of efa installed' do
241+
cwd Chef::Config[:file_cache_path]
242+
code <<-EFA
243+
set -ex
244+
modinfo efa
245+
grep "EFA installer version: #{node['cluster']['efa']['installer_version']}" /opt/amazon/efa_installed_packages
246+
EFA
247+
end
248+
# GDR (GPUDirect RDMA)
249+
execute 'check efa gdr installed' do
250+
command "modinfo efa | grep 'gdr:\ *Y'"
251+
user node['cluster']['cluster_user']
252+
end
231253
end
232254
end
233255

libraries/helpers.rb

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -396,17 +396,6 @@ def get_nvswitches
396396
nvswitch_check.stdout.strip.to_i
397397
end
398398

399-
# Check if EFA GDR is enabled (and supported) on this instance
400-
def efa_gdr_enabled?
401-
config_value = node['cluster']['enable_efa_gdr']
402-
enabling_value = if node['cluster']['node_type'] == "ComputeFleet"
403-
"compute"
404-
else
405-
"head_node"
406-
end
407-
(config_value == enabling_value || config_value == "cluster") && graphic_instance?
408-
end
409-
410399
# Alinux OSs currently not correctly supported by NFS cookbook
411400
# Overwriting templates for node['nfs']['config']['server_template'] used by NFS cookbook for these OSs
412401
# When running, NFS cookbook will use nfs.conf.erb templates provided in this cookbook to generate server_template

0 commit comments

Comments
 (0)