From 749e6fc08c9c3a43d49d48ef8b1d8ece30908d34 Mon Sep 17 00:00:00 2001 From: Seth Boyles Date: Thu, 5 Dec 2024 15:45:22 -0800 Subject: [PATCH] Granular max-in-flight updates (#4124) * Scale action now compares number of starting/running non-routable instances to max-in-flight, instead of waiting for all instances to become routable * Scale action does not continue scaling up when any instances are 'unhealthy' (e.g. 'crashed', 'down', etc) as it's difficult to determine if unhealthy instances belong to the 'max in flight' group * Number of desired nondeploying instances now recalculated each iteration instead of decrementing by 'max-in-flight' without checking if it's in a correct state. This mitigates bugs where deployment trains (continually creating new deployments before the previous had completed) could result in number app instances exceeding the max-in-flight limit. --- .../deployment_updater/actions/scale.rb | 80 ++++- .../deployment_updater/actions/scale_spec.rb | 286 +++++++++++++++--- 2 files changed, 317 insertions(+), 49 deletions(-) diff --git a/lib/cloud_controller/deployment_updater/actions/scale.rb b/lib/cloud_controller/deployment_updater/actions/scale.rb index 84d72c4b414..a037b66a0d7 100644 --- a/lib/cloud_controller/deployment_updater/actions/scale.rb +++ b/lib/cloud_controller/deployment_updater/actions/scale.rb @@ -6,6 +6,7 @@ module VCAP::CloudController module DeploymentUpdater module Actions class Scale + HEALTHY_STATES = [VCAP::CloudController::Diego::LRP_RUNNING, VCAP::CloudController::Diego::LRP_STARTING].freeze attr_reader :deployment, :logger, :app def initialize(deployment, logger) @@ -17,14 +18,15 @@ def initialize(deployment, logger) def call deployment.db.transaction do return unless deployment.lock!.state == DeploymentModel::DEPLOYING_STATE - return unless all_instances_routable? + + return unless can_scale? || can_downscale? app.lock! + oldest_web_process_with_instances.lock! deploying_web_process.lock! deployment.update( - last_healthy_at: Time.now, state: DeploymentModel::DEPLOYING_STATE, status_value: DeploymentModel::ACTIVE_STATUS_VALUE, status_reason: DeploymentModel::DEPLOYING_STATUS_REASON @@ -36,36 +38,88 @@ def call end ScaleDownCanceledProcesses.new(deployment).call - ScaleDownOldProcess.new(deployment, oldest_web_process_with_instances, desired_old_instances).call - deploying_web_process.update(instances: desired_new_instances) + scale_down_old_processes if can_downscale? + + if can_scale? + deploying_web_process.update(instances: desired_new_instances) + deployment.update(last_healthy_at: Time.now) + end end end private - def desired_old_instances - [(oldest_web_process_with_instances.instances - deployment.max_in_flight), 0].max + def scale_down_old_processes + instances_to_reduce = non_deploying_web_processes.map(&:instances).sum - desired_non_deploying_instances + + return if instances_to_reduce <= 0 + + non_deploying_web_processes.each do |process| + if instances_to_reduce < process.instances + ScaleDownOldProcess.new(deployment, process, process.instances - instances_to_reduce).call + break + end + + instances_to_reduce -= process.instances + ScaleDownOldProcess.new(deployment, process, 0).call + end + end + + def can_scale? + starting_instances.count < deployment.max_in_flight && + unhealthy_instances.count == 0 && + routable_instances.count >= deploying_web_process.instances - deployment.max_in_flight + rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError + logger.info("skipping-deployment-update-for-#{deployment.guid}") + false + end + + def can_downscale? + non_deploying_web_processes.map(&:instances).sum > desired_non_deploying_instances + rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError + logger.info("skipping-deployment-update-for-#{deployment.guid}") + false + end + + def desired_non_deploying_instances + [deployment.original_web_process_instance_count - routable_instances.count, 0].max end def desired_new_instances - [deploying_web_process.instances + deployment.max_in_flight, deployment.original_web_process_instance_count].min + [routable_instances.count + deployment.max_in_flight, deployment.original_web_process_instance_count].min end def oldest_web_process_with_instances @oldest_web_process_with_instances ||= app.web_processes.select { |process| process.instances > 0 }.min_by { |p| [p.created_at, p.id] } end + def non_deploying_web_processes + app.web_processes.reject { |process| process.guid == deploying_web_process.guid }.sort_by { |p| [p.created_at, p.id] } + end + def deploying_web_process @deploying_web_process ||= deployment.deploying_web_process end - def all_instances_routable? - instances = instance_reporters.all_instances_for_app(deployment.deploying_web_process) - instances.all? { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] } - rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError - logger.info("skipping-deployment-update-for-#{deployment.guid}") - false + def starting_instances + healthy_instances.reject { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] } + end + + def routable_instances + reported_instances.select { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] } + end + + def healthy_instances + reported_instances.select { |_, val| HEALTHY_STATES.include?(val[:state]) } + end + + def unhealthy_instances + reported_instances.reject { |_, val| HEALTHY_STATES.include?(val[:state]) } + end + + def reported_instances + @reported_instances = instance_reporters.all_instances_for_app(deploying_web_process) end def instance_reporters diff --git a/spec/unit/lib/cloud_controller/deployment_updater/actions/scale_spec.rb b/spec/unit/lib/cloud_controller/deployment_updater/actions/scale_spec.rb index e8d41b9deb8..b94c81ea437 100644 --- a/spec/unit/lib/cloud_controller/deployment_updater/actions/scale_spec.rb +++ b/spec/unit/lib/cloud_controller/deployment_updater/actions/scale_spec.rb @@ -31,7 +31,7 @@ module VCAP::CloudController let!(:deploying_route_mapping) { RouteMappingModel.make(app: web_process.app, process_type: deploying_web_process.type) } let(:space) { web_process.space } let(:original_web_process_instance_count) { 6 } - let(:current_web_instances) { 2 } + let(:current_web_instances) { 6 } let(:current_deploying_instances) { 0 } let(:state) { DeploymentModel::DEPLOYING_STATE } @@ -42,17 +42,19 @@ module VCAP::CloudController deploying_web_process: deploying_web_process, state: state, original_web_process_instance_count: original_web_process_instance_count, - max_in_flight: 1 + max_in_flight: max_in_flight ) end + let(:max_in_flight) { 1 } + let(:diego_instances_reporter) { instance_double(Diego::InstancesReporter) } let(:all_instances_results) do - { - 0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, - 1 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, - 2 => { state: 'RUNNING', uptime: 50, since: 2, routable: true } - } + instances = {} + current_deploying_instances.times do |i| + instances[i] = { state: 'RUNNING', uptime: 50, since: 2, routable: true } + end + instances end let(:instances_reporters) { double(:instance_reporters) } let(:logger) { instance_double(Steno::Logger, info: nil, error: nil) } @@ -68,13 +70,17 @@ module VCAP::CloudController expect(deployment).to have_received(:lock!) end - it 'scales the old web process down by one after the first iteration' do - expect(original_web_process_instance_count).to be > current_deploying_instances - expect do - subject.call - end.to change { - web_process.reload.instances - }.by(-1) + context 'after a new instance has been brought up' do + let(:current_deploying_instances) { 1 } + + it 'scales the old web process down by one' do + expect(original_web_process_instance_count).to be > current_deploying_instances + expect do + subject.call + end.to change { + web_process.reload.instances + }.by(-1) + end end it 'scales up the new web process by one' do @@ -104,12 +110,12 @@ module VCAP::CloudController }.by(2) end - it 'scales the old web process down by two after the first iteration' do + it 'doesnt scale down the old web process (there are no new routable instances yet)' do expect do subject.call - end.to change { + end.not_to(change do web_process.reload.instances - }.by(-2) + end) end end @@ -125,6 +131,8 @@ module VCAP::CloudController ) end + let(:current_web_instances) { 1 } + it 'scales up the new web process by the maximum number' do expect do subject.call @@ -133,12 +141,12 @@ module VCAP::CloudController }.by(1) end - it 'scales the old web process down to 0' do + it 'doesnt scale down the old web process (there are no new routable instances yet)' do expect do subject.call - end.to change { + end.not_to(change do web_process.reload.instances - }.to(0) + end) end end @@ -161,12 +169,12 @@ module VCAP::CloudController }.by(original_web_process_instance_count) end - it 'scales the old web process down to 0' do + it 'doesnt scale down the old web process (there is no new routable instance yet)' do expect do subject.call - end.to change { + end.not_to(change do web_process.reload.instances - }.to(0) + end) end end @@ -216,6 +224,17 @@ module VCAP::CloudController context 'when the (oldest) web process will be at zero instances and is type web' do let(:current_web_instances) { 1 } let(:current_deploying_instances) { 3 } + let(:original_web_process_instance_count) { 6 } + + let!(:interim_deploying_web_process) do + ProcessModel.make( + app: web_process.app, + created_at: an_hour_ago, + type: ProcessTypes::WEB, + instances: 3, + guid: 'guid-interim' + ) + end it 'does not destroy the web process, but scales it to 0' do subject.call @@ -266,6 +285,14 @@ module VCAP::CloudController type: ProcessTypes::WEB ) end + let!(:other_web_process_with_instances) do + ProcessModel.make( + instances: 10, + app: app, + created_at: 1.hour.ago, + type: ProcessTypes::WEB + ) + end it 'destroys the oldest web process and ignores the original web process' do expect do @@ -275,8 +302,187 @@ module VCAP::CloudController end end - context 'when one of the deploying_web_process instances is starting' do + context 'when there are more instances in interim processes than there should be' do + let(:current_deploying_instances) { 10 } + let(:original_web_process_instance_count) { 20 } + let(:max_in_flight) { 4 } + + let!(:web_process) do + ProcessModel.make( + guid: 'web_process', + instances: 10, + app: app, + created_at: a_day_ago - 11, + type: ProcessTypes::WEB + ) + end + let!(:oldest_web_process_with_instances) do + ProcessModel.make( + guid: 'oldest_web_process_with_instances', + instances: 10, + app: app, + created_at: a_day_ago - 10, + type: ProcessTypes::WEB + ) + end + let!(:other_web_process_with_instances) do + ProcessModel.make( + instances: 10, + app: app, + created_at: a_day_ago - 9, + type: ProcessTypes::WEB + ) + end + + it 'scales down interim proceses so all instances equal original instance count + max in flight' do + non_deploying_instance_count = app.web_processes.reject { |p| p.guid == deploying_web_process.guid }.map(&:instances).sum + expect(non_deploying_instance_count).to eq 30 + expect(deploying_web_process.instances).to eq 10 + + subject.call + + non_deploying_instance_count = app.reload.web_processes.reject { |p| p.guid == deploying_web_process.guid }.map(&:instances).sum + expect(non_deploying_instance_count).to eq 10 + expect(deploying_web_process.reload.instances).to eq 14 + end + + it 'destroys interim processes that have been scaled down' do + subject.call + expect(ProcessModel.find(guid: web_process.guid)).to be_present + expect(ProcessModel.find(guid: oldest_web_process_with_instances.guid)).to be_nil + expect(ProcessModel.find(guid: other_web_process_with_instances.guid)).to be_present + end + + context 'when all in-flight instances are not up' do + let(:all_instances_results) do + { + 0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, + 1 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, + 2 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, + 3 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, + 4 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, + 5 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, + 6 => { state: 'STARTING', uptime: 50, since: 2, routable: true }, + 7 => { state: 'STARTING', uptime: 50, since: 2, routable: true }, + 8 => { state: 'DOWN', uptime: 0, since: 2, routable: false }, + 9 => { state: 'FAILING', uptime: 50, since: 2, routable: false } + } + end + + it 'scales down interim proceses so all instances equal original instance count + max in flight' do + non_deploying_instance_count = app.web_processes.reject { |p| p.guid == deploying_web_process.guid }.map(&:instances).sum + expect(non_deploying_instance_count).to eq 30 + expect(deploying_web_process.instances).to eq 10 + + subject.call + + non_deploying_instance_count = app.reload.web_processes.reject { |p| p.guid == deploying_web_process.guid }.map(&:instances).sum + expect(non_deploying_instance_count).to eq 14 + expect(deploying_web_process.reload.instances).to eq 10 + end + end + end + + context 'when there are fewer instances in interim processes than there should be' do + let(:current_deploying_instances) { 10 } + let(:original_web_process_instance_count) { 20 } + let(:max_in_flight) { 4 } + + let!(:web_process) do + ProcessModel.make( + guid: 'web_process', + instances: 1, + app: app, + created_at: a_day_ago - 11, + type: ProcessTypes::WEB + ) + end + let!(:oldest_web_process_with_instances) do + ProcessModel.make( + guid: 'oldest_web_process_with_instances', + instances: 1, + app: app, + created_at: a_day_ago - 10, + type: ProcessTypes::WEB + ) + end + let!(:other_web_process_with_instances) do + ProcessModel.make( + instances: 1, + app: app, + created_at: a_day_ago - 10, + type: ProcessTypes::WEB + ) + end + + it 'doesnt try to scale up or down the iterim processes' do + non_deploying_instance_count = app.web_processes.reject { |p| p.guid == deploying_web_process.guid }.map(&:instances).sum + expect(non_deploying_instance_count).to eq 3 + expect(deploying_web_process.reload.instances).to eq 10 + + subject.call + + non_deploying_instance_count = app.web_processes.reject { |p| p.guid == deploying_web_process.guid }.map(&:instances).sum + expect(non_deploying_instance_count).to eq 3 + expect(deploying_web_process.reload.instances).to eq 14 + end + end + + context 'when some instances are missing' do + let(:current_web_instances) { 10 } + let(:original_web_process_instance_count) { 10 } + + let(:current_deploying_instances) { 9 } + let(:max_in_flight) { 5 } + + let(:all_instances_results) do + { + 0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true } + } + end + + it 'doesn\'t upscale' do + expect(web_process.instances).to eq 10 + expect(deploying_web_process.instances).to eq 9 + + subject.call + + expect(web_process.reload.instances).to eq 9 + expect(deploying_web_process.reload.instances).to eq 9 + end + end + + context 'when some, but not all, instances have finished' do + let(:current_web_instances) { 10 } + let(:original_web_process_instance_count) { 10 } + + let(:current_deploying_instances) { 5 } + let(:max_in_flight) { 5 } + + let(:all_instances_results) do + { + 0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, + 1 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, + 2 => { state: 'STARTING', uptime: 50, since: 2, routable: true }, + 3 => { state: 'RUNNING', uptime: 50, since: 2, routable: false }, + 4 => { state: 'STARTING', uptime: 50, since: 2, routable: false } + } + end + + it 'scales more instances to match max_in_flight' do + expect(web_process.instances).to eq 10 + expect(deploying_web_process.instances).to eq 5 + + subject.call + + expect(web_process.reload.instances).to eq 8 + expect(deploying_web_process.reload.instances).to eq 7 + end + end + + context 'when greater than or equal to max_in_flight of the deploying_web_process instances is starting' do let(:current_deploying_instances) { 3 } + let(:max_in_flight) { 2 } let(:all_instances_results) do { 0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, @@ -285,13 +491,15 @@ module VCAP::CloudController } end - it 'does not scales the process' do + it 'downscales the original process' do expect do subject.call - end.not_to(change do + end.to(change do web_process.reload.instances - end) + end.from(6).to(5)) + end + it 'does not scale the deploying web process' do expect do subject.call end.not_to(change do @@ -300,8 +508,9 @@ module VCAP::CloudController end end - context 'when one of the deploying_web_process instances is not routable' do + context 'when greater than or equal to max_in_flight of the deploying_web_process instances is not routable' do let(:current_deploying_instances) { 3 } + let(:max_in_flight) { 2 } let(:all_instances_results) do { 0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, @@ -310,13 +519,15 @@ module VCAP::CloudController } end - it 'does not scales the process' do + it 'downscales the original process' do expect do subject.call - end.not_to(change do + end.to(change do web_process.reload.instances - end) + end.from(6).to(5)) + end + it 'does not scale the deploying web process' do expect do subject.call end.not_to(change do @@ -325,23 +536,26 @@ module VCAP::CloudController end end - context 'when one of the deploying_web_process instances is failing' do + context 'when greater than or equal to max_in_flight of the deploying_web_process instances is failing' do let(:current_deploying_instances) { 3 } + let(:max_in_flight) { 2 } let(:all_instances_results) do { 0 => { state: 'RUNNING', uptime: 50, since: 2, routable: true }, 1 => { state: 'FAILING', uptime: 50, since: 2, routable: true }, - 2 => { state: 'FAILING', uptime: 50, since: 2, routable: true } + 2 => { state: 'CRASHED', uptime: 50, since: 2, routable: true } } end - it 'does not scale the process' do + it 'downscales the original process' do expect do subject.call - end.not_to(change do + end.to(change do web_process.reload.instances - end) + end.from(6).to(5)) + end + it 'does not scale the deploying web process' do expect do subject.call end.not_to(change do