File tree Expand file tree Collapse file tree 6 files changed +33
-27
lines changed
aws-parallelcluster-config/recipes
aws-parallelcluster-install/recipes
aws-parallelcluster-test/recipes Expand file tree Collapse file tree 6 files changed +33
-27
lines changed Original file line number Diff line number Diff line change @@ -17,8 +17,6 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1717- Do not strip ` - ` from compute resource name when configuring Slurm nodes.
1818- Upgrade Slurm to version 21.08.4.
1919
20- ** BUG FIXES**
21- - Fix issue that is preventing cluster names to start with ` parallelcluster- ` prefix.
2220
23213.0.2
2422------
@@ -34,6 +32,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
3432 - Libfabric: `` libfabric-1.13.2 ``
3533 - Open MPI: `` openmpi40-aws-4.1.1-2 ``
3634
35+ ** BUG FIXES**
36+ - Fix issue that prevented cluster names from starting with the ` parallelcluster- ` prefix.
37+
37383.0.1
3839------
3940
Original file line number Diff line number Diff line change 187187)
188188
189189# EFA
190- default [ 'cluster' ] [ 'efa' ] [ 'installer_version' ] = '1.13.0 '
190+ default [ 'cluster' ] [ 'efa' ] [ 'installer_version' ] = '1.14.1 '
191191default [ 'cluster' ] [ 'efa' ] [ 'installer_url' ] = "https://efa-installer.amazonaws.com/aws-efa-installer-#{ node [ 'cluster' ] [ 'efa' ] [ 'installer_version' ] } .tar.gz"
192- default [ 'cluster' ] [ 'enable_efa_gdr' ] = "no"
193192default [ 'cluster' ] [ 'efa' ] [ 'unsupported_aarch64_oses' ] = %w( centos7 )
194193
195194# NICE DCV
Original file line number Diff line number Diff line change 1515# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1616# limitations under the License.
1717
18- # Installation recipe must be re-executed at runtime to enable GDR
19- include_recipe "aws-parallelcluster-install::efa"
20-
2118if platform? ( 'ubuntu' ) && node [ 'cluster' ] [ 'enable_efa' ] == 'compute' && node [ 'cluster' ] [ 'node_type' ] == 'ComputeFleet'
2219 # Disabling ptrace protection is needed for EFA in order to use SHA transfer for intra-node communication.
2320 sysctl 'kernel.yama.ptrace_scope' do
Original file line number Diff line number Diff line change 1919efa_installed = efa_installed?
2020
2121if efa_installed && !::File . exist? ( efa_tarball )
22- Chef ::Log . warn ( "Existing EFA version differs from the one shipped with ParallelCluster. Skipping ParallelCluster EFA installation and configuration. enable_gdr option will be ignored. " )
22+ Chef ::Log . warn ( "Existing EFA version differs from the one shipped with ParallelCluster. Skipping ParallelCluster EFA installation and configuration." )
2323 return
2424end
2525
5050installer_options = "-y"
5151# skip efa-kmod installation on not supported platforms
5252installer_options += " -k" unless node [ 'conditions' ] [ 'efa_supported' ]
53- # enable gpudirect support
54- installer_options += " -g" if efa_gdr_enabled?
5553
5654bash "install efa" do
5755 cwd node [ 'cluster' ] [ 'sources_dir' ]
6260 ./efa_installer.sh #{ installer_options }
6361 rm -rf #{ node [ 'cluster' ] [ 'sources_dir' ] } /aws-efa-installer
6462 EFAINSTALL
65- not_if { efa_installed && ! efa_gdr_enabled? }
63+ not_if { efa_installed }
6664end
Original file line number Diff line number Diff line change @@ -222,12 +222,34 @@ module load intelmpi && mpirun --help | grep '#{node['cluster']['intelmpi']['kit
222222end
223223
224224###################
225- # EFA - GDR (GPUDirect RDMA)
225+ # EFA
226226###################
227- if node [ 'conditions' ] [ 'efa_supported' ] && efa_gdr_enabled?
228- execute 'check efa gdr installed' do
229- command "modinfo efa | grep 'gdr:\ *Y'"
230- user node [ 'cluster' ] [ 'cluster_user' ]
227+ if node [ 'conditions' ] [ 'efa_supported' ]
228+ if node [ 'cluster' ] [ 'os' ] . end_with? ( "-custom" )
229+ # Only check that EFA is installed: when EFA is already present in the base AMI, installation is skipped, so no specific version can be expected
230+ bash 'check efa installed' do
231+ cwd Chef ::Config [ :file_cache_path ]
232+ code <<-EFA
233+ set -ex
234+ modinfo efa
235+ cat /opt/amazon/efa_installed_packages
236+ EFA
237+ end
238+ else
239+ # Check that EFA is installed and that its version matches the expected installer version
240+ bash 'check correct version of efa installed' do
241+ cwd Chef ::Config [ :file_cache_path ]
242+ code <<-EFA
243+ set -ex
244+ modinfo efa
245+ grep "EFA installer version: #{ node [ 'cluster' ] [ 'efa' ] [ 'installer_version' ] } " /opt/amazon/efa_installed_packages
246+ EFA
247+ end
248+ # GDR (GPUDirect RDMA)
249+ execute 'check efa gdr installed' do
250+ command "modinfo efa | grep 'gdr:\ *Y'"
251+ user node [ 'cluster' ] [ 'cluster_user' ]
252+ end
231253 end
232254end
233255
Original file line number Diff line number Diff line change @@ -396,17 +396,6 @@ def get_nvswitches
396396 nvswitch_check . stdout . strip . to_i
397397end
398398
399- # Check if EFA GDR is enabled (and supported) on this instance
400- def efa_gdr_enabled?
401- config_value = node [ 'cluster' ] [ 'enable_efa_gdr' ]
402- enabling_value = if node [ 'cluster' ] [ 'node_type' ] == "ComputeFleet"
403- "compute"
404- else
405- "head_node"
406- end
407- ( config_value == enabling_value || config_value == "cluster" ) && graphic_instance?
408- end
409-
410399# Alinux OSs currently not correctly supported by NFS cookbook
411400# Overwriting templates for node['nfs']['config']['server_template'] used by NFS cookbook for these OSs
412401# When running, NFS cookbook will use nfs.conf.erb templates provided in this cookbook to generate server_template
You can’t perform that action at this time.
0 commit comments