Skip to content

Commit

Permalink
dcrdtest: avoid concurrent exec.Cmd.Wait calls
Browse files (browse the repository at this point in the history)
exec.Cmd.Wait is not safe to call concurrently.  Instead, call this method
only once, in the goroutine started by node.start() that detects abnormal
process exit.  After Wait returns, assign the error and close a new cmdDone
channel added to the node struct.  This allows node.stop() to detect when
process shutdown has occurred, and provides access to the error so it can be
logged.

Since some other parts of node.stop() were racy on reading node.cmd, and it
was unclear how node.stop() was intended to handle the difference between a
node that was never successfully started and one that was previously stopped,
another new channel cmdStarted is added to node to distinguish these
conditions.
  • Loading branch information
jrick committed Apr 4, 2024
1 parent f6c4140 commit 841153e
Showing 1 changed file with 34 additions and 18 deletions.
52 changes: 34 additions & 18 deletions dcrdtest/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,12 +177,15 @@ func (n *nodeConfig) String() string {
type node struct {
config *nodeConfig

cmd *exec.Cmd
pidFile string
stderr io.ReadCloser
stdout io.ReadCloser
wg sync.WaitGroup
pid int
cmd *exec.Cmd
cmdStarted chan struct{} // Closed after cmd is started and assigned
cmdDone chan error // Sent error and closed after cmdErr is assigned
cmdErr error // Must only be read after receiving on cmdDone
pidFile string
stderr io.ReadCloser
stdout io.ReadCloser
wg sync.WaitGroup
pid int

// Locally bound addresses for the subsystems.
p2pAddr string
Expand All @@ -202,8 +205,10 @@ func (n *node) logf(format string, args ...interface{}) {
// as the base for the log and data directories for dcrd.
func newNode(config *nodeConfig, dataDir string) (*node, error) {
return &node{
config: config,
dataDir: dataDir,
config: config,
dataDir: dataDir,
cmdStarted: make(chan struct{}),
cmdDone: make(chan error, 1),
}, nil
}

Expand Down Expand Up @@ -309,6 +314,7 @@ func (n *node) start(ctx context.Context) error {
return fmt.Errorf("%w: %v", errDcrdCmdExec, err)
}
n.cmd = cmd
close(n.cmdStarted)
n.pid = n.cmd.Process.Pid

// Unblock pipes now that pid is available.
Expand All @@ -330,17 +336,21 @@ func (n *node) start(ctx context.Context) error {
return err
}

earlyShutdown := make(chan error, 1)
n.wg.Add(1)
go func() {
earlyShutdown <- cmd.Wait()
defer n.wg.Done()

n.cmdErr = n.cmd.Wait()
n.cmdDone <- n.cmdErr
close(n.cmdDone)
}()

// Read the RPC and P2P addresses.
select {
case <-ctx.Done():
_ = n.stop() // Cleanup what has been done so far.
return ctx.Err()
case err := <-earlyShutdown:
case err := <-n.cmdDone:
_ = n.stop()
return err
case <-gotSubsysAddrs:
Expand All @@ -358,12 +368,20 @@ func (n *node) stop() error {
log.Tracef("stop %p", n.cmd)
defer log.Tracef("stop done")

if n.cmd == nil || n.cmd.Process == nil {
// return if not properly initialized
// or error starting the process
select {
case <-n.cmdStarted:
default:
// has not been started (yet, or ever)
return nil
}

select {
case <-n.cmdDone:
// already stopped
return nil
default:
}

// Attempt a graceful dcrd shutdown by closing the pipeRX files.
err := n.config.pipeRX.close()
if err != nil {
Expand All @@ -389,8 +407,8 @@ func (n *node) stop() error {

// Wait for command to exit.
log.Tracef("stop cmd.Wait")
err = n.cmd.Wait()
if err != nil {
<-n.cmdDone
if err := n.cmdErr; err != nil {
log.Debugf("stop cmd.Wait error: %v", err)
}

Expand All @@ -399,8 +417,6 @@ func (n *node) stop() error {
n.logf("Unable to close pipe TX: %v", err)
}

// Mark command terminated.
n.cmd = nil
return nil
}

Expand Down

0 comments on commit 841153e

Please sign in to comment.