Skip to content

Commit c4b5181

Browse files
committed
Bail out after stop timeout
Previously, the shutdown code looped endlessly until the child process finished, requesting graceful termination over and over again. Change this to a single request-termination -> wait -> bail-out sequence. This ensures that k0s won't hang when a supervised process can't be terminated for whatever reason: the shutdown code will return once the timeout has expired, at the latest. Use a buffered channel for the wait result, so that the waiting goroutine is able to exit even if nothing reads from the channel anymore. Introduce fine-grained error reporting to differentiate shutdown outcomes (graceful shutdown, forced kill, failure, and so on). Signed-off-by: Tom Wieczorek <[email protected]>
1 parent 509dcce commit c4b5181

File tree

1 file changed

+65
-23
lines changed

1 file changed

+65
-23
lines changed

pkg/supervisor/supervisor.go

Lines changed: 65 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"strconv"
1616
"strings"
1717
"sync"
18+
"syscall"
1819
"time"
1920

2021
"github.com/sirupsen/logrus"
@@ -55,7 +56,7 @@ const k0sManaged = "_K0S_MANAGED=yes"
5556
// processWaitQuit waits for a process to exit or a shut down signal
5657
// returns true if shutdown is requested
5758
func (s *Supervisor) processWaitQuit(ctx context.Context, cmd *exec.Cmd) bool {
58-
waitresult := make(chan error)
59+
waitresult := make(chan error, 1)
5960
go func() {
6061
waitresult <- cmd.Wait()
6162
}()
@@ -64,29 +65,14 @@ func (s *Supervisor) processWaitQuit(ctx context.Context, cmd *exec.Cmd) bool {
6465

6566
select {
6667
case <-ctx.Done():
67-
for {
68-
s.log.Debug("Requesting graceful termination")
69-
if err := requestGracefulTermination(cmd.Process); err != nil {
70-
if errors.Is(err, os.ErrProcessDone) {
71-
s.log.Info("Failed to request graceful termination: process has already terminated")
72-
} else {
73-
s.log.WithError(err).Error("Failed to request graceful termination")
74-
}
75-
} else {
76-
s.log.Info("Requested graceful termination")
77-
}
78-
select {
79-
case <-time.After(s.TimeoutStop):
80-
continue
81-
case err := <-waitresult:
82-
if err != nil {
83-
s.log.WithError(err).Error("Failed to wait for process")
84-
} else {
85-
s.log.Info("Process exited: ", s.cmd.ProcessState)
86-
}
87-
return true
88-
}
68+
err := s.terminateSupervisedProcess(cmd, waitresult)
69+
if err != nil {
70+
s.log.WithError(err).Error("Error while terminating process")
71+
} else {
72+
s.log.Info("Process terminated successfully")
8973
}
74+
return true
75+
9076
case err := <-waitresult:
9177
var exitErr *exec.ExitError
9278
state := cmd.ProcessState
@@ -103,6 +89,62 @@ func (s *Supervisor) processWaitQuit(ctx context.Context, cmd *exec.Cmd) bool {
10389
}
10490
}
10591

92+
// terminateSupervisedProcess asks the supervised process to terminate
// gracefully exactly once and then waits for it to exit, for at most
// s.TimeoutStop.
//
// cmd is the command whose process receives the termination request.
// waitresult delivers the result of waiting on that command (processWaitQuit
// feeds it from cmd.Wait()).
//
// It returns nil only when the termination request succeeded and the wait
// result is nil. Every other outcome is reported as an error:
//   - the process was ended by SIGTERM itself,
//   - the process exited with some other *exec.ExitError,
//   - waiting for the process failed,
//   - no wait result arrived within s.TimeoutStop,
//   - the process had already finished before the request could be made,
//   - the termination request itself failed.
//
// On timeout this function only reports the error; it does not kill the
// process.
func (s *Supervisor) terminateSupervisedProcess(cmd *exec.Cmd, waitresult <-chan error) error {
93+
s.log.Debug("Requesting graceful termination")
94+
err := requestGracefulTermination(cmd.Process)
95+
// Dispatch on the outcome of the termination request.
switch {
96+
case err == nil:
97+
// Termination request sent, wait for process to finish.
98+
s.log.Debug("Awaiting graceful process termination for ", s.TimeoutStop)
99+
100+
select {
101+
// This err shadows the outer one. Every branch of this case returns, so
// the outer err is only ever reassigned by the timeout case below.
case err := <-waitresult:
102+
var exitErr *exec.ExitError
103+
switch {
104+
case err == nil:
105+
return nil
106+
case errors.As(err, &exitErr):
107+
// Sys() is system-dependent, hence the checked type assertion. A wait
// status carrying SIGTERM means the signal itself ended the process.
// NOTE(review): presumably requestGracefulTermination sends SIGTERM on
// this platform — confirm against its implementation.
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok && status.Signal() == syscall.SIGTERM {
108+
return errors.New("process terminated without handling SIGTERM")
109+
}
110+
// Any other unsuccessful exit is returned unchanged.
return exitErr
111+
default:
112+
return fmt.Errorf("failed to wait for process: %w", err)
113+
}
114+
115+
// No wait result arrived within s.TimeoutStop.
case <-time.After(s.TimeoutStop):
116+
err = fmt.Errorf("timed out after %s while waiting for process to terminate", s.TimeoutStop)
117+
}
118+
119+
// Only reached from the timeout case; err holds the timeout error.
return err
120+
121+
case errors.Is(err, os.ErrProcessDone):
122+
// The process has finished even before the termination could be requested.
123+
// Non-blocking receive: pick up the wait result only if it is already
// available, otherwise fall back to the default case.
select {
124+
case err = <-waitresult:
125+
var exitErr *exec.ExitError
126+
// Start from cmd.ProcessState; an exit error overrides it with its own state.
state := cmd.ProcessState
127+
switch {
128+
case errors.As(err, &exitErr):
129+
state = exitErr.ProcessState
130+
// fallthrough runs the next case body without testing its condition, so
// exit errors and nil wait results are both reported via state.String().
fallthrough
131+
case err == nil:
132+
err = errors.New(state.String())
133+
default:
134+
return fmt.Errorf("failed to wait for process: %s (%w)", state, err)
135+
}
136+
default:
137+
err = errors.New("process state unavailable")
138+
}
139+
140+
// err is now either the process state text or "process state unavailable".
return fmt.Errorf("process terminated before graceful termination could be requested: %w", err)
141+
142+
default:
143+
// Something else went wrong
144+
return fmt.Errorf("failed to request graceful termination: %w", err)
145+
}
146+
}
147+
106148
// Supervise Starts supervising the given process
107149
func (s *Supervisor) Supervise() error {
108150
s.startStopMutex.Lock()

0 commit comments

Comments
 (0)