Skip to content

Commit f1ea2c7

Browse files
committed
Remove the healthcheck step
To speed up deployments, we'll remove the healthcheck step. This adds some risk to deployments for non-web roles - if they don't have a Docker healthcheck configured then the only check we do is whether the container is running. If there is a bad image we might see the container running before it exits and deploy it. Previously the healthcheck step would have avoided this by ensuring a web container could boot and serve traffic first. To mitigate this, we'll add a web barrier. Non-web containers will wait before shutting down the old containers until at least one web container has passed its healthcheck. If the web container fails its healthcheck, we'll close the barrier and shut down the new containers on the non-web roles. We also have a new integration test to check we correctly handle a broken image. This highlighted that SSHKit's default runner will stop at the first error it encounters. We'll now have a custom runner that waits for all threads to finish, allowing them to clean up. Finally, we only tag an image as the latest after we have successfully started the container and passed the web barrier, if applicable. That means that if we have a deployment that completes on some hosts but not others we can run `kamal app version --quiet` to see which version is running on each host.
1 parent 3628eca commit f1ea2c7

File tree

24 files changed

+269
-332
lines changed

24 files changed

+269
-332
lines changed

lib/kamal/cli/app.rb

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,19 @@ def boot
99

1010
# Assets are prepared in a separate step to ensure they are on all hosts before booting
1111
on(KAMAL.hosts) do
12+
execute *KAMAL.auditor.record("Tagging #{KAMAL.config.absolute_image} as the latest image"), verbosity: :debug
13+
1214
KAMAL.roles_on(host).each do |role|
1315
Kamal::Cli::App::PrepareAssets.new(host, role, self).run
1416
end
1517
end
1618

19+
web_barrier = Kamal::Cli::Healthcheck::Barrier.new if web_and_non_web_roles?
20+
1721
on(KAMAL.hosts, **KAMAL.boot_strategy) do |host|
22+
# Ensure web roles are booted first to allow the web barrier to be opened
1823
KAMAL.roles_on(host).each do |role|
19-
Kamal::Cli::App::Boot.new(host, role, version, self).run
24+
Kamal::Cli::App::Boot.new(host, role, version, web_barrier, self).run
2025
end
2126
end
2227

@@ -282,4 +287,8 @@ def current_running_version(host: KAMAL.primary_host)
282287
def version_or_latest
283288
options[:version] || KAMAL.config.latest_tag
284289
end
290+
291+
def web_and_non_web_roles?
292+
KAMAL.roles.any?(&:running_traefik?) && !KAMAL.roles.all?(&:running_traefik?)
293+
end
285294
end

lib/kamal/cli/app/boot.rb

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
class Kamal::Cli::App::Boot
2-
attr_reader :host, :role, :version, :sshkit
2+
attr_reader :host, :role, :version, :web_barrier, :sshkit
33
delegate :execute, :capture_with_info, :info, to: :sshkit
4-
delegate :uses_cord?, :assets?, to: :role
4+
delegate :uses_cord?, :assets?, :running_traefik?, to: :role
55

6-
def initialize(host, role, version, sshkit)
6+
def initialize(host, role, version, web_barrier, sshkit)
77
@host = host
88
@role = role
99
@version = version
10+
@web_barrier = web_barrier
1011
@sshkit = sshkit
1112
end
1213

@@ -21,18 +22,6 @@ def run
2122
end
2223

2324
private
24-
def app
25-
@app ||= KAMAL.app(role: role)
26-
end
27-
28-
def auditor
29-
@auditor = KAMAL.auditor(role: role)
30-
end
31-
32-
def audit(message)
33-
execute *auditor.record(message), verbosity: :debug
34-
end
35-
3625
def old_version_renamed_if_clashing
3726
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
3827
renamed_version = "#{version}_replaced_#{SecureRandom.hex(8)}"
@@ -46,9 +35,18 @@ def old_version_renamed_if_clashing
4635

4736
def start_new_version
4837
audit "Booted app version #{version}"
38+
4939
execute *app.tie_cord(role.cord_host_file) if uses_cord?
5040
execute *app.run(hostname: "#{host}-#{SecureRandom.hex(6)}")
41+
5142
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
43+
44+
reach_web_barrier
45+
rescue => e
46+
close_web_barrier if running_traefik?
47+
execute *app.stop(version: version), raise_on_non_zero_exit: false
48+
49+
raise
5250
end
5351

5452
def stop_old_version(version)
@@ -64,4 +62,41 @@ def stop_old_version(version)
6462

6563
execute *app.clean_up_assets if assets?
6664
end
65+
66+
def reach_web_barrier
67+
if web_barrier
68+
if running_traefik?
69+
web_barrier.open
70+
else
71+
wait_for_web_barrier
72+
end
73+
end
74+
end
75+
76+
def wait_for_web_barrier
77+
info "Waiting at web barrier (#{host})..."
78+
web_barrier.wait
79+
info "Barrier opened (#{host})"
80+
rescue Kamal::Cli::Healthcheck::Error
81+
info "Barrier closed, shutting down new container... (#{host})"
82+
raise
83+
end
84+
85+
def close_web_barrier
86+
if web_barrier
87+
web_barrier.close
88+
end
89+
end
90+
91+
def app
92+
@app ||= KAMAL.app(role: role)
93+
end
94+
95+
def auditor
96+
@auditor = KAMAL.auditor(role: role)
97+
end
98+
99+
def audit(message)
100+
execute *auditor.record(message), verbosity: :debug
101+
end
67102
end

lib/kamal/cli/healthcheck.rb

Lines changed: 0 additions & 21 deletions
This file was deleted.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
class Kamal::Cli::Healthcheck::Barrier
2+
def initialize
3+
@ivar = Concurrent::IVar.new
4+
end
5+
6+
def close
7+
set(false)
8+
end
9+
10+
def open
11+
set(true)
12+
end
13+
14+
def wait
15+
unless opened?
16+
raise Kamal::Cli::Healthcheck::Error.new("Halted at barrier")
17+
end
18+
end
19+
20+
private
21+
def opened?
22+
@ivar.value
23+
end
24+
25+
def set(value)
26+
@ivar.set(value)
27+
rescue Concurrent::MultipleAssignmentError
28+
end
29+
end

lib/kamal/cli/healthcheck/error.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class Kamal::Cli::Healthcheck::Error < StandardError
2+
end

lib/kamal/cli/healthcheck/poller.rb

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ module Kamal::Cli::Healthcheck::Poller
33

44
TRAEFIK_UPDATE_DELAY = 5
55

6-
class HealthcheckError < StandardError; end
76

87
def wait_for_healthy(pause_after_ready: false, &block)
98
attempt = 1
@@ -16,9 +15,9 @@ def wait_for_healthy(pause_after_ready: false, &block)
1615
when "running" # No health check configured
1716
sleep KAMAL.config.readiness_delay if pause_after_ready
1817
else
19-
raise HealthcheckError, "container not ready (#{status})"
18+
raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})"
2019
end
21-
rescue HealthcheckError => e
20+
rescue Kamal::Cli::Healthcheck::Error => e
2221
if attempt <= max_attempts
2322
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
2423
sleep attempt
@@ -41,9 +40,9 @@ def wait_for_unhealthy(pause_after_ready: false, &block)
4140
when "unhealthy"
4241
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
4342
else
44-
raise HealthcheckError, "container not unhealthy (#{status})"
43+
raise Kamal::Cli::Healthcheck::Error, "container not unhealthy (#{status})"
4544
end
46-
rescue HealthcheckError => e
45+
rescue Kamal::Cli::Healthcheck::Error => e
4746
if attempt <= max_attempts
4847
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
4948
sleep attempt

lib/kamal/cli/main.rb

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,6 @@ def deploy
4141
say "Ensure Traefik is running...", :magenta
4242
invoke "kamal:cli:traefik:boot", [], invoke_options
4343

44-
if KAMAL.config.role(KAMAL.config.primary_role).running_traefik?
45-
say "Ensure app can pass healthcheck...", :magenta
46-
invoke "kamal:cli:healthcheck:perform", [], invoke_options
47-
end
48-
4944
say "Detect stale containers...", :magenta
5045
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
5146

@@ -76,9 +71,6 @@ def redeploy
7671

7772
run_hook "pre-deploy"
7873

79-
say "Ensure app can pass healthcheck...", :magenta
80-
invoke "kamal:cli:healthcheck:perform", [], invoke_options
81-
8274
say "Detect stale containers...", :magenta
8375
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
8476

@@ -223,9 +215,6 @@ def version
223215
desc "env", "Manage environment files"
224216
subcommand "env", Kamal::Cli::Env
225217

226-
desc "healthcheck", "Healthcheck application"
227-
subcommand "healthcheck", Kamal::Cli::Healthcheck
228-
229218
desc "lock", "Manage the deploy lock"
230219
subcommand "lock", Kamal::Cli::Lock
231220

lib/kamal/commander.rb

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,9 @@ def primary_role
6161
end
6262

6363
def roles
64-
(specific_roles || config.roles).select do |role|
65-
((specific_hosts || config.all_hosts) & role.hosts).any?
66-
end
64+
(specific_roles || config.roles) \
65+
.select { |role| ((specific_hosts || config.all_hosts) & role.hosts).any? }
66+
.sort_by { |role| role.running_traefik? ? 0 : 1 }
6767
end
6868

6969
def hosts
@@ -178,6 +178,7 @@ def configure_sshkit_with(config)
178178
sshkit.max_concurrent_starts = config.sshkit.max_concurrent_starts
179179
sshkit.ssh_options = config.ssh.options
180180
end
181+
SSHKit.config.default_runner = SSHKit::Runner::ParallelCompleteAll
181182
SSHKit.config.command_map[:docker] = "docker" # No need to use /usr/bin/env, just clogs up the logs
182183
SSHKit.config.output_verbosity = verbosity
183184
end

lib/kamal/commands/healthcheck.rb

Lines changed: 0 additions & 59 deletions
This file was deleted.

lib/kamal/configuration.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ def sshkit
188188

189189

190190
def healthcheck
191-
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {})
191+
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {})
192192
end
193193

194194
def healthcheck_service

0 commit comments

Comments
 (0)