Skip to content

Commit 0827218

Browse files
committed
Bail out after stop timeout
Previously, the shutdown code looped endlessly until the child process finished, requesting graceful termination over and over again. Change this to a single request-termination -> wait -> bail-out logic. This ensures that k0s won't hang when a supervised process can't be terminated for whatever reason: the shutdown code will return once the stop timeout has expired at the latest. Use a buffered channel for the wait result, so that the waiting goroutine is able to exit even if nothing reads from the channel anymore. Introduce fine-grained error reporting to differentiate shutdown outcomes (graceful shutdown, forced kill, failure, and so on). Signed-off-by: Tom Wieczorek <[email protected]>
1 parent 5c3cd44 commit 0827218

File tree

1 file changed

+64
-23
lines changed

1 file changed

+64
-23
lines changed

pkg/supervisor/supervisor.go

Lines changed: 64 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ const k0sManaged = "_K0S_MANAGED=yes"
5959
// processWaitQuit waits for a process to exit or a shut down signal
6060
// returns true if shutdown is requested
6161
func (s *Supervisor) processWaitQuit(ctx context.Context, cmd *exec.Cmd) bool {
62-
waitresult := make(chan error)
62+
waitresult := make(chan error, 1)
6363
go func() {
6464
waitresult <- cmd.Wait()
6565
}()
@@ -68,29 +68,14 @@ func (s *Supervisor) processWaitQuit(ctx context.Context, cmd *exec.Cmd) bool {
6868

6969
select {
7070
case <-ctx.Done():
71-
for {
72-
s.log.Debug("Requesting graceful termination")
73-
if err := requestGracefulTermination(cmd.Process); err != nil {
74-
if errors.Is(err, os.ErrProcessDone) {
75-
s.log.Info("Failed to request graceful termination: process has already terminated")
76-
} else {
77-
s.log.WithError(err).Error("Failed to request graceful termination")
78-
}
79-
} else {
80-
s.log.Info("Requested graceful termination")
81-
}
82-
select {
83-
case <-time.After(s.TimeoutStop):
84-
continue
85-
case err := <-waitresult:
86-
if err != nil {
87-
s.log.WithError(err).Error("Failed to wait for process")
88-
} else {
89-
s.log.Info("Process exited: ", s.cmd.ProcessState)
90-
}
91-
return true
92-
}
71+
err := s.terminateSupervisedProcess(cmd, waitresult)
72+
if err != nil {
73+
s.log.WithError(err).Error("Error while terminating process")
74+
} else {
75+
s.log.Info("Process terminated successfully")
9376
}
77+
return true
78+
9479
case err := <-waitresult:
9580
var exitErr *exec.ExitError
9681
state := cmd.ProcessState
@@ -107,6 +92,62 @@ func (s *Supervisor) processWaitQuit(ctx context.Context, cmd *exec.Cmd) bool {
10792
}
10893
}
10994

95+
func (s *Supervisor) terminateSupervisedProcess(cmd *exec.Cmd, waitresult <-chan error) error {
96+
s.log.Debug("Requesting graceful termination")
97+
err := requestGracefulTermination(cmd.Process)
98+
switch {
99+
case err == nil:
100+
// Termination request sent, wait for process to finish.
101+
s.log.Debug("Awaiting graceful process termination for ", s.TimeoutStop)
102+
103+
select {
104+
case err := <-waitresult:
105+
var exitErr *exec.ExitError
106+
switch {
107+
case err == nil:
108+
return nil
109+
case errors.As(err, &exitErr):
110+
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok && status.Signal() == syscall.SIGTERM {
111+
return errors.New("process terminated without handling SIGTERM")
112+
}
113+
return exitErr
114+
default:
115+
return fmt.Errorf("failed to wait for process: %w", err)
116+
}
117+
118+
case <-time.After(s.TimeoutStop):
119+
err = fmt.Errorf("timed out after %s while waiting for process to terminate", s.TimeoutStop)
120+
}
121+
122+
return err
123+
124+
case errors.Is(err, os.ErrProcessDone):
125+
// The process has finished even before the termination could be requested.
126+
select {
127+
case err = <-waitresult:
128+
var exitErr *exec.ExitError
129+
state := cmd.ProcessState
130+
switch {
131+
case errors.As(err, &exitErr):
132+
state = exitErr.ProcessState
133+
fallthrough
134+
case err == nil:
135+
err = errors.New(state.String())
136+
default:
137+
return fmt.Errorf("failed to wait for process: %s (%w)", state, err)
138+
}
139+
default:
140+
err = errors.New("process state unavailable")
141+
}
142+
143+
return fmt.Errorf("process terminated before graceful termination could be requested: %w", err)
144+
145+
default:
146+
// Something else went wrong
147+
return fmt.Errorf("failed to request graceful termination: %w", err)
148+
}
149+
}
150+
110151
// Supervise Starts supervising the given process
111152
func (s *Supervisor) Supervise() error {
112153
s.startStopMutex.Lock()

0 commit comments

Comments
 (0)