diff --git a/CHANGELOG.md b/CHANGELOG.md index 9029793616..259e4b00eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste 3.15.0 ------ +**CHANGES** +- Mitigate the risk of transient build-image failures in RHEL and Rocky caused by out-of-sync repo mirrors, +by refreshing the local cache at every failed attempt. 3.14.1 ------ diff --git a/cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_redhat8.rb index e408bf32f8..b7d5dabdcc 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_redhat8.rb @@ -17,7 +17,7 @@ end use 'partial/_install_packages_common.rb' -use 'partial/_install_packages_rhel_amazon.rb' +use 'partial/_install_packages_rhel_rocky.rb' def default_packages # environment-modules required by EFA, Intel MPI and ARM PL diff --git a/cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_rocky8.rb index acad9d3317..375d43560c 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/install_packages/install_packages_rocky8.rb @@ -17,7 +17,7 @@ end use 'partial/_install_packages_common.rb' -use 'partial/_install_packages_rhel_amazon.rb' +use 'partial/_install_packages_rhel_rocky.rb' def default_packages # environment-modules required by EFA, Intel MPI and ARM PL diff --git a/cookbooks/aws-parallelcluster-platform/resources/install_packages/partial/_install_packages_rhel_rocky.rb b/cookbooks/aws-parallelcluster-platform/resources/install_packages/partial/_install_packages_rhel_rocky.rb 
new file mode 100644
index 0000000000..7473c78f50
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-platform/resources/install_packages/partial/_install_packages_rhel_rocky.rb
@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+
+#
+# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+# Installs the requested packages with dnf, refreshing the repository metadata before every attempt.
+#
+# Each attempt runs `dnf clean metadata && dnf makecache` and then `dnf install -y <packages>`.
+# A refresh or install that fails or times out counts as one failed attempt. Up to 10 attempts are
+# made, 5 seconds apart; a RuntimeError is raised when the last one fails.
+action :install do
+  ruby_block 'install_packages_with_metadata_refresh' do
+    block do
+      max_retries = 10
+      retry_delay = 5
+      package_names = Array(new_resource.packages)
+
+      # Without package names `dnf install -y` is an error, so every retry would be spent on a no-op request.
+      next if package_names.empty?
+
+      packages = package_names.join(' ')
+
+      # Runs one refresh + install cycle; returns nil on success or a description of what failed.
+      attempt_install = lambda do
+        # Refresh metadata on each attempt to handle mirror inconsistency. The non-raising shell_out is
+        # used so that a refresh hitting an out-of-sync mirror is retried instead of aborting the run.
+        refresh = shell_out('dnf clean metadata && dnf makecache', timeout: 300)
+        next "metadata refresh failed: #{refresh.stderr}" if refresh.error?
+
+        install = shell_out("dnf install -y #{packages}", timeout: 600)
+        install.error? ? install.stderr : nil
+      rescue Mixlib::ShellOut::CommandTimeout => e
+        # A timeout raises even from the non-raising shell_out; count it as a failed attempt too.
+        e.message
+      end
+
+      1.upto(max_retries) do |attempt|
+        failure = attempt_install.call
+        break unless failure
+
+        Chef::Log.warn("Package install attempt #{attempt}/#{max_retries} failed: #{failure}")
+        raise "Package installation failed after #{max_retries} attempts" if attempt == max_retries
+
+        sleep retry_delay
+      end
+    end
+  end
+end
diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/install_packages_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/install_packages_spec.rb index da87f3d3e0..6a498d36b9 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/install_packages_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/install_packages_spec.rb @@ -32,11 +32,17 @@ def self.setup(chef_run) end if %w(amazon centos redhat rocky).include?(platform) - it 
'installs default packages' do - is_expected.to install_package(default_packages) - .with(retries: 10) - .with(retry_delay: 5) - .with(flush_cache: { before: true }) + if platform == 'amazon' + it 'installs default packages' do + is_expected.to install_package(default_packages) + .with(retries: 10) + .with(retry_delay: 5) + .with(flush_cache: { before: true }) + end + else + it 'installs default packages with metadata refresh' do + is_expected.to run_ruby_block('install_packages_with_metadata_refresh') + end end if platform == 'amazon' && version == '2' diff --git a/cookbooks/aws-parallelcluster-shared/resources/package_repos/package_repos_redhat8.rb b/cookbooks/aws-parallelcluster-shared/resources/package_repos/package_repos_redhat8.rb index c2fe584ca4..4d76e7b15a 100644 --- a/cookbooks/aws-parallelcluster-shared/resources/package_repos/package_repos_redhat8.rb +++ b/cookbooks/aws-parallelcluster-shared/resources/package_repos/package_repos_redhat8.rb @@ -41,6 +41,11 @@ execute 'yum-config-manager_skip_if_unavail' do command "yum-config-manager --setopt=\*.skip_if_unavailable=1 --save" end + + # Reduce metadata cache time to mitigate mirror inconsistency issues + execute 'yum-config-manager_metadata_expire' do + command "yum-config-manager --setopt=\*.metadata_expire=300 --save" + end end action :update do diff --git a/cookbooks/aws-parallelcluster-shared/resources/package_repos/package_repos_rocky8.rb b/cookbooks/aws-parallelcluster-shared/resources/package_repos/package_repos_rocky8.rb index c816a3a7e7..64e1f64a23 100644 --- a/cookbooks/aws-parallelcluster-shared/resources/package_repos/package_repos_rocky8.rb +++ b/cookbooks/aws-parallelcluster-shared/resources/package_repos/package_repos_rocky8.rb @@ -48,6 +48,11 @@ def powertool_name execute 'yum-config-manager_skip_if_unavail' do command "yum-config-manager --setopt=\*.skip_if_unavailable=1 --save" end + + # Reduce metadata cache time to mitigate mirror inconsistency issues + execute 
'yum-config-manager_metadata_expire' do + command "yum-config-manager --setopt=\*.metadata_expire=300 --save" + end end action :update do diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/package_repos_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/package_repos_spec.rb index 1758192042..988fe9788c 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/package_repos_spec.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/package_repos_spec.rb @@ -63,6 +63,11 @@ def self.setup(chef_run) .with(command: 'yum-config-manager --setopt=*.skip_if_unavailable=1 --save') end + it 'sets metadata expire time' do + is_expected.to run_execute('yum-config-manager_metadata_expire') + .with(command: 'yum-config-manager --setopt=*.metadata_expire=300 --save') + end + it 'enables rhui' do is_expected.to run_execute('yum-config-manager-rhel') .with(command: "yum-config-manager --enable codeready-builder-for-rhel-#{version.to_i}-rhui-rpms") @@ -102,6 +107,11 @@ def self.setup(chef_run) .with(command: 'yum-config-manager --setopt=*.skip_if_unavailable=1 --save') end + it 'sets metadata expire time' do + is_expected.to run_execute('yum-config-manager_metadata_expire') + .with(command: 'yum-config-manager --setopt=*.metadata_expire=300 --save') + end + else pending "Implement for #{platform}" end