diff --git a/ai-track-docs/resilience.md b/ai-track-docs/resilience.md new file mode 100644 index 0000000..4710514 --- /dev/null +++ b/ai-track-docs/resilience.md @@ -0,0 +1,48 @@ +# Resilience — Ex15 + +## Improvement: Error Mapping in `knife status` + +`knife status` queries the Chef Server via `Chef::Search::Query#search`. Previously, any network or server error produced an unhandled exception with a Ruby stack trace — confusing for operators and hard to script around. + +**Ex15 adds a rescue block** that maps two failure modes to clear fatal messages and a clean `exit 1`: + +| Error | Cause | Output | +|-------|-------|--------| +| `Net::HTTPFatalError`, `Net::HTTPServerException` | Chef Server returns 5xx (`Net::HTTPFatalError`) or 4xx (`Net::HTTPServerException`) | `FATAL: Chef Server returned an error: #{e.message}` | +| `SocketError` | DNS failure / network unreachable | `FATAL: Cannot reach Chef Server: #{e.message}` | + +### Code Location + +```ruby +# lib/chef/knife/status.rb — run method +begin + q.search(:node, query, build_search_opts) { |node| all_nodes << node } +rescue Net::HTTPFatalError, Net::HTTPServerException => e + ui.fatal("Chef Server returned an error: #{e.message}") + exit 1 +rescue SocketError => e + ui.fatal("Cannot reach Chef Server: #{e.message}") + exit 1 +end +``` + +## Failure Tests + +Two tests in `spec/unit/knife/status_spec.rb` under `"resilience: error mapping"`: + +1. **HTTP 500** — stubs `Net::HTTPFatalError`, expects `ui.fatal` + `SystemExit(1)` +2. **SocketError** — stubs `SocketError`, expects `ui.fatal` + `SystemExit(1)` + +### Running Locally + +```bash +bundle exec rspec spec/unit/knife/status_spec.rb -e "resilience" +# Expect: 2 examples, 0 failures + +bundle exec rspec spec/unit/knife/status_spec.rb +# Expect: 23 examples, 0 failures +``` + +## Why Error Mapping (Not Retry) + +`knife status` is a **read-only query** command. Retrying on 5xx risks masking persistent server issues and adds latency. The right behavior is to fail fast with a clear message so operators can investigate the server directly. 
Retry/backoff is appropriate for write operations or bootstrap flows (already present in `knife bootstrap`). diff --git a/lib/chef/knife/status.rb b/lib/chef/knife/status.rb index 3ef4df4..3906885 100644 --- a/lib/chef/knife/status.rb +++ b/lib/chef/knife/status.rb @@ -73,11 +73,20 @@ def run all_nodes = [] q = Chef::Search::Query.new - search_start = Process.clock_gettime(Process::CLOCK_MONOTONIC) - q.search(:node, query, build_search_opts) do |node| - all_nodes << node + begin + search_start = Process.clock_gettime(Process::CLOCK_MONOTONIC) + q.search(:node, query, build_search_opts) do |node| + all_nodes << node + end + search_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - search_start + # net/http raises Net::HTTPFatalError for 5xx responses and Net::HTTPServerException for 4xx; map both. + rescue Net::HTTPFatalError, Net::HTTPServerException => e + ui.fatal("Chef Server returned an error: #{e.message}") + exit 1 + rescue SocketError => e + ui.fatal("Cannot reach Chef Server: #{e.message}") + exit 1 end - search_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - search_start if ENV["KNIFE_TIMING"] Chef::Log.info("op=knife_status status=ok nodes=#{all_nodes.size} elapsed_ms=#{(search_elapsed * 1000).round}") diff --git a/spec/unit/knife/status_spec.rb b/spec/unit/knife/status_spec.rb index d368b93..0219c41 100644 --- a/spec/unit/knife/status_spec.rb +++ b/spec/unit/knife/status_spec.rb @@ -199,4 +199,31 @@ end end end + + describe "resilience: error mapping" do + context "when the Chef Server returns an HTTP error" do + before do + response = double("Net::HTTPResponse", code: "500", message: "Internal Server Error") + allow(response).to receive(:body).and_return("") + error = Net::HTTPFatalError.new("500 Internal Server Error", response) + allow(@query).to receive(:search).and_raise(error) + end + + it "prints a fatal message and exits 1" do + expect(@knife.ui).to receive(:fatal).with(/Chef Server returned an error/) + expect { @knife.run }.to raise_error(SystemExit) { |e| expect(e.status).to eq(1) } + end + end + + context "when the network is unreachable 
(SocketError)" do + before do + allow(@query).to receive(:search).and_raise(SocketError, "getaddrinfo: nodename nor servname provided") + end + + it "prints a fatal message and exits 1" do + expect(@knife.ui).to receive(:fatal).with(/Cannot reach Chef Server/) + expect { @knife.run }.to raise_error(SystemExit) { |e| expect(e.status).to eq(1) } + end + end + end end